Supplemental code used for generating results for the manuscript Epigenetic blueprint identifies poor outcome and hypomethylating agent-responsive T-ALL subgroup. Following code is written in R. Please refer to the end of the document for sessionInfo regarding package versions.
#For data import/wrangling
library(data.table)
library(readxl)
#For Diff. Methylation and Expression analysis
library(limma)
library(DESeq2)
library(sva)
library(BiocParallel)
register(MulticoreParam(8))
#For enrichment analysis
library(goseq)
library(maftools)
library(GenomicRanges)
library(LOLA)
#For ontogeny analysis
library(ape)
#For Clustering
library(NMF)
library(fossil) #For rand.index function
library(umap)
library(parallel)
#For survival
library(survival)
#For visualization
library(pheatmap)
suppressPackageStartupMessages(library(circlize))
suppressPackageStartupMessages(library(ComplexHeatmap))
library(RColorBrewer)
library(ggplot2)
library(peakSeason) #https://github.com/PoisonAlien/peakseason
#Modified pheatmap which avoids repetitive legends
source("code/pheatmap2.R")
#Accessory helpful functions
source("code/helper_functions.R")
#Wrapper function for running NMF
source("code/extract_componenets.R")
Raw IDAT files and processed beta value matrices are available on GEO under the accession GSE147667. Below analysis begins with the processed beta values.
#Processed Tumor and normal beta values - stored as ExpressionSet objects
tumor_data = readRDS(file = "data/t-alls_eset.RDs")
print(tumor_data)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 779812 features, 143 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: sample_231 sample_234 ... sample_RAD_MI (143 total)
## varLabels: Sample_Name Age ... HOXA_nd (52 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg26928153 cg16269199 ... cg02995750 (779812 total)
## fvarLabels: rn Chr ... Relation_to_Island (12 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
normal_data = readRDS(file = "data/thymic_normals_eset.RDs")
print(normal_data)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 779812 features, 12 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: sam_CD34_1 sam_CD34_2 ... sam_SPo8_2 (12 total)
## varLabels: Sample_Name Age ... HOXA_nd (52 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg26928153 cg16269199 ... cg02995750 (779812 total)
## fvarLabels: rn Chr ... Relation_to_Island (12 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
#Uniform color-codes for visualization.
color_codes = readRDS(file = "data/color_codes.RDs")
Remove probes located on CN altered regions such as amplicons or deletions. Such probes can lead to false methylation values thereby affecting the analysis. Each of the EPIC array was analyzed with conumee package and segmentation files were generated. Segmentation files from all the samples are further processed with GISTIC2.0 to identify genomic regions that are recurrently and significantly amplified or deleted across the cohort.
gistic_res = maftools::readGistic(gisticAllLesionsFile = "data/gistic/all_lesions.conf_90.txt",
gisticAmpGenesFile = "data/gistic/amp_genes.conf_90.txt",
gisticDelGenesFile = "data/gistic/del_genes.conf_90.txt",
gisticScoresFile = "data/gistic//scores.gistic")
## -Processing Gistic files..
## --Processing amp_genes.conf_90.txt
## --Processing del_genes.conf_90.txt
## --Processing scores.gistic
## --Summarizing by samples
#CNV region limits
cnv_regions = maftools::getCytobandSummary(x = gistic_res)[,.(Wide_Peak_Limits, Variant_Classification)]
cnv_regions[, chr := data.table::tstrsplit(x = Wide_Peak_Limits, split = ":")[[1]]]
cnv_regions[, region := data.table::tstrsplit(x = Wide_Peak_Limits, split = ":")[[2]]]
cnv_regions[, start := data.table::tstrsplit(x = cnv_regions$region, split = "-")[[1]]]
cnv_regions[, end := data.table::tstrsplit(x = cnv_regions$region, split = "-")[[2]]]
cnv_regions[, region := NULL]
cnv_regions[, start := as.numeric(start)]
cnv_regions[, end := as.numeric(end)]
cnv_regions = cnv_regions[,.(chr, start, end, Variant_Classification, Wide_Peak_Limits)]
data.table::setkey(x = cnv_regions, chr, start, end)
#EPIC probe annotations
epic_anno = data.table::fread(input = "data/epic_genomic_annotations.tsv.gz")
epic_anno = epic_anno[,.(Chr, Start, End, Strand, rn)]
colnames(epic_anno)[1:3] = c('chr', 'start', 'end')
epic_anno[, start := as.numeric(start)]
epic_anno[, end := as.numeric(end)]
data.table::setkey(x = epic_anno, chr, start, end)
#Probes within CN altered probes
cn_probes = data.table::foverlaps(x = cnv_regions, y = epic_anno)
cn_probes = cn_probes[!duplicated(rn)]
message("Number of porbes on CN altered regions: ", nrow(cn_probes))
## Number of porbes on CN altered regions: 11363
print(cn_probes[,.N,Variant_Classification])
## Variant_Classification N
## 1: Del 8020
## 2: Amp 3343
cn_altered_probes = unique(cn_probes$rn)
#Remove copy-number altered probes
tumor_data_clean = exprs(tumor_data[!rownames(tumor_data) %in% cn_altered_probes, ])
#Estimate SD on tumor samples alone
tumor_data_clean = order_by_sds(mat = tumor_data_clean, keep_sd = FALSE)
#Using top 5% most variable data
n_rows = as.integer(nrow(tumor_data_clean) * 0.05)
clust_dat = tumor_data_clean[1:n_rows,]
print(dim(clust_dat))
## [1] 38583 143
Here we use NMF to estimate optimal number of components from a mixture of cohort.
#Range of probes to try
trial_nrows = c(seq(5000, 35000, 5000), nrow(clust_dat))
print(trial_nrows)
## [1] 5000 10000 15000 20000 25000 30000 35000 38583
#estimate_components function sourced from `extract_componenets.R`
nmf_trial_nrows = lapply(trial_nrows, function(nrows) {
nmf_run = estimate_componenets(
X = as.matrix(x = clust_dat[1:nrows, ]),
n_run = 5,
n_threads = 8,
n_max = 10
)
nmf_run
})
names(nmf_trial_nrows) = paste0("n_", trial_nrows)
Cophenetic correlation is a measure of goodness of fit.
row_cols = RColorBrewer::brewer.pal(n = 9, name = "YlOrRd")[2:9]
par(mar = c(4, 5, 1, 1), cex = 0.8)
plot(nmf_trial_nrows[[1]]$rank, nmf_trial_nrows[[1]]$cophenetic , axes = FALSE, pch = 16, col = row_cols[1], xlab = NA, ylab = NA, ylim = c(0.94, 1), xlim = c(2, 10))
abline(h = seq(0.94, 1, 0.01), v = 2:10, col = grDevices::adjustcolor(col = 'gray70', alpha.f = 0.7), lty = 2)
lines(nmf_trial_nrows[[1]]$rank, nmf_trial_nrows[[1]]$cophenetic, lwd = 1.2, col = row_cols[1])
points(nmf_trial_nrows[[1]]$rank, nmf_trial_nrows[[1]]$cophenetic, pch = 19, col = row_cols[1])
for(i in 2:length(nmf_trial_nrows)){
lines(x = nmf_trial_nrows[[i]]$rank, nmf_trial_nrows[[i]]$cophenetic, col = row_cols[i])
points(nmf_trial_nrows[[i]]$rank, nmf_trial_nrows[[i]]$cophenetic, pch = 19, col = row_cols[i])
}
axis(side = 1, at = 2:10, labels = 2:10, font = 1)
axis(side = 2, at = seq(0.94, 1, 0.01), font = 1, las = 2, cex = 1.4)
mtext(text = "Number of components", side = 1, line = 2.5, adj = 0, cex = 0.8)
mtext(text = "Cophenetic-correlation metric", side = 2, line = 3.5, adj = 0, cex = 0.8)
legend(x = 5, y = 0.97, legend = trial_nrows, col = row_cols, pch = 19, cex = 0.6)
Based on above elbow plot of Cophenetic-correlation metric, optimal number of components seem to be 5 - since after 5 there is a marginal change, followed by gradual decline.
Re-run nmf at n=5 for above range of probes to access cluster stability
nmf_trial_nrows_k5 = lapply(trial_nrows, function(nrows) {
nmf_run = extract_componenets(
X = as.matrix(x = clust_dat[1:nrows, ]),
n_run = 5,
n_threads = 8,
n = 5
)
nmf_run
})
#Perform k-mean clustering on contributions
nmf_k5_clusters = lapply(nmf_trial_nrows_k5, function(x){
set.seed(seed = 1024)
x_clust = kmeans(x = t(x$contributions), centers = 5)
x_clust$cluster
})
#Get cluster assignments from k-mean results
nmf_k5_clusters = data.frame(
row.names = colnames(nmf_trial_nrows_k5[[1]]$contributions),
cluster = data.frame(nmf_k5_clusters),
stringsAsFactors = FALSE
)
colnames(nmf_k5_clusters) = paste0("nprobes_", trial_nrows)
Check how stable the clusters are by estimating Rand-index between all k-mean clustering results
#Pairwise rand-index estimation
interactions = sapply(1:ncol(nmf_k5_clusters), function(i) {
sapply(1:ncol(nmf_k5_clusters), function(j) {
fossil::rand.index(group1 = nmf_k5_clusters[, i], group2 = nmf_k5_clusters[, j])
})
})
#Draw rand-index
rownames(interactions) = colnames(interactions) = colnames(nmf_k5_clusters)
interactions[lower.tri(x = interactions)] = NA
m <- nrow(interactions)
n <- ncol(interactions)
par(bty="n", mar = c(1, 4, 2, 5)+.1, las=2, tcl=-.33, cex = 0.6)
image(
x = 1:n,
y = 1:m,
interactions,
col = rev(RColorBrewer::brewer.pal(9, "PRGn")),
xaxt = "n",
yaxt = "n",
xlab = "",
ylab = "",
xlim = c(0, n+2),
ylim = c(0, n+2)
)
abline(h=0:n+.5, col="white", lwd=.5)
abline(v=0:n+.5, col="white", lwd=.5)
image(y = seq(0.5*nrow(interactions), 0.9*nrow(interactions), length.out = 8), x=rep(n,2)+c(2,2.5)+1, z=matrix(c(1:8), nrow=1), col = rev(RColorBrewer::brewer.pal(8,"PRGn")), add=TRUE)
atLims = seq(0.5*nrow(interactions), 0.9*nrow(interactions), length.out = 7)
atLimsLabs = round(seq(range(interactions, na.rm = T)[1], range(interactions, na.rm = T)[2], length.out = 7), 2)
axis(side = 4, at = atLims, tcl = -.15, labels = atLimsLabs, las = 1, lwd = .5)
mtext(side = 4, at = median(atLims), "Rand Index", las = 3, cex = 0.9, line = 3, font = 2)
mtext(side = 2, at = 1:m, text = gsub(pattern = "nprobes_", replacement = "", x = colnames(interactions)), cex = 0.7, font = 1)
mtext(side = 3, at = 1:n, text = gsub(pattern = "nprobes_", replacement = "", x = colnames(interactions)), las = 2, cex = 0.7, font = 1, line = -2)
This is a simple visualization to see the movement of samples for different clustering results.
#Use clusters from nprobes_38990 as a control
nmf_k5_clusters = nmf_k5_clusters[order(nmf_k5_clusters$nprobes_38583),,]
clust_dat = clust_dat[,rownames(nmf_k5_clusters)]
clust_dat_cor_mat = cor(x = clust_dat, method = "pearson")
#Cluster annotations
nmf_k5_clusters_anno = apply(nmf_k5_clusters, 2, function(x) paste0("K_", x))
rownames(nmf_k5_clusters_anno) = rownames(nmf_k5_clusters)
#Cluster colors
clust_cols = lapply(1:ncol(nmf_k5_clusters_anno), function(x) {
clust_cols = RColorBrewer::brewer.pal(n = 5, name = "Set1")
names(clust_cols) = paste0("K_", 1:5)
clust_cols
})
names(clust_cols) = colnames(nmf_k5_clusters_anno)
clust_dat_cor_mat[lower.tri(x = clust_dat_cor_mat, diag = TRUE)] = NA
pheatmap(mat = clust_dat_cor_mat, cluster_rows = FALSE, cluster_cols = FALSE, show_rownames = FALSE, show_colnames = FALSE, annotation_col = nmf_k5_clusters, border_color = "white", fontsize = 7, indiv_legend = list(Clusters = c("nprobes_5000", "nprobes_10000", 'nprobes_15000', "nprobes_20000", "nprobes_25000", "nprobes_30000", "nprobes_35000")), annotation_colors = clust_cols, na_col = 'white')
## Loading required package: scales
## Loading required package: gtable
#Final clusters to be used in downstream analysis
data.table::setDT(x = nmf_k5_clusters, keep.rownames = TRUE)
final_clusters = nmf_k5_clusters[,.(rn, nprobes_38583)]
colnames(final_clusters) = c('Sample_Name', 'Cluster')
Rename clusters:
C instead of KC1, C2, and so on..mean_meth = colMeans(x = exprs(tumor_data), na.rm = TRUE)
mean_meth = data.table::data.table(Sample_Name = names(mean_meth), avg_meth = mean_meth, stringsAsFactors = FALSE)
mean_meth = merge(mean_meth, final_clusters)
mean_meth[,Cluster := paste0("C", Cluster)]
clust_order = mean_meth[,median(avg_meth), Cluster][order(V1, decreasing = FALSE)][,Cluster]
print(clust_order)
## [1] "C3" "C2" "C4" "C1" "C5"
#Rename and change order
mean_meth$Cluster = factor(x = mean_meth[,Cluster], levels = clust_order, labels = paste0("C", 1:5))
Final cluster sizes
mean_meth[,.N,Cluster][order(as.character(Cluster))]
## Cluster N
## 1: C1 14
## 2: C2 34
## 3: C3 22
## 4: C4 41
## 5: C5 32
#Use nmf weights (at N = 38553 probes) for UMAP visualization
nmf_contributions = nmf_trial_nrows_k5[[8]]$contributions
umap_settings = umap.defaults
umap_settings$n_neighbors = 14
umap_settings$min_dist = 0.1
umap_settings$random_state = 1024
umap_settings$n_components = 2
#UMAP layout
umap_lo = umap::umap(d = t(nmf_contributions), config = umap_settings)
umap_lo = data.table::as.data.table(as.data.frame(umap_lo$layout), keep.rownames = TRUE)
colnames(umap_lo) = c('Sample_Name', 'UMAP1', 'UMAP2')
#Points are too close to visualize UMAP data; add random noise for better separation
set.seed(seed = 1024)
umap_lo$UMAP1 = jitter(umap_lo$UMAP1, amount = 1)
set.seed(seed = 1024)
umap_lo$UMAP2 = jitter(umap_lo$UMAP2, amount = 1)
#Add custer info
umap_lo = merge(umap_lo, mean_meth[,.(Sample_Name, Cluster)], by = 'Sample_Name')
clust_cols = c(C1 = "#E41A1C", C2 = "#377EB8", C3 = "#4DAF4A", C4 = "#984EA3", C5 = "#FF7F00")
umap_lo$Cluster_cols = clust_cols[as.character(umap_lo$Cluster)]
par(mar = c(1, 1, 1, 1))
plot(NA, pch = 21, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(umap_lo$UMAP1), ylim = range(umap_lo$UMAP2))
abline(h = seq(range(umap_lo$UMAP2)[1], range(umap_lo$UMAP2)[2], length.out = 5), col = "gray70", lty = 2)
abline(v = seq(range(umap_lo$UMAP1)[1], range(umap_lo$UMAP1)[2], length.out = 5), col = "gray70", lty = 2)
points(x = umap_lo$UMAP1, y = umap_lo$UMAP2, col = "black", bg = grDevices::adjustcolor(umap_lo$Cluster_cols, alpha.f = 0.8), pch = 21)
legend("bottomright", legend = names(clust_cols), col = clust_cols, pch = 19, cex = 0.6)
Add cluster information to ExpressionSet
data.table::setDF(mean_meth, mean_meth$Sample_Name)
mean_meth = mean_meth[tumor_data$Sample_Name,]
tumor_data$Cluster = as.factor(as.character(mean_meth$Cluster))
#Cleanup
rm(tumor_data_clean)
rm(clust_dat)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 10223168 546.0 16481394 880.3 16481394 880.3
## Vcells 166383550 1269.5 557565041 4253.9 537680612 4102.2
tumor_data = order_by_sds(tumor_data)
samp_order = order(as.character(tumor_data$Cluster))
clust_dat_cor_mat = cor(x = exprs(tumor_data)[1:n_rows, samp_order], method = "spearman")
clust_dat_cor_mat[lower.tri(x = clust_dat_cor_mat, diag = FALSE)] = NA
pheatmap::pheatmap(
mat = clust_dat_cor_mat,
cluster_rows = FALSE,
cluster_cols = FALSE,
show_rownames = FALSE,
show_colnames = FALSE,
annotation_col = pData(tumor_data)[samp_order, "Cluster", drop = FALSE],
border_color = "white",
fontsize = 7,
annotation_colors = color_codes["Cluster"], na_col = "white"
)
mean_meth_n = colMeans(x = exprs(normal_data), na.rm = TRUE)
mean_meth_n = data.table::data.table(Sample_Name = names(mean_meth_n), avg_meth = mean_meth_n, Cluster = "Normal", stringsAsFactors = FALSE)
mean_meth = data.table::rbindlist(list(mean_meth, mean_meth_n))
mean_meth$Cluster = factor(x = mean_meth$Cluster, levels = c("Normal", paste0("C", 1:5)))
mean_meth$Cluster_codes = color_codes$Cluster[as.character(mean_meth$Cluster)]
#changes between groups
pt_test = pairwise.t.test(x = mean_meth$avg_meth, g = mean_meth$Cluster, paired = FALSE)
print(pt_test)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: mean_meth$avg_meth and mean_meth$Cluster
##
## Normal C1 C2 C3 C4
## C1 1.0000 - - - -
## C2 1.0000 1.0000 - - -
## C3 0.0031 0.0021 0.0016 - -
## C4 4.6e-07 1.0e-07 4.2e-10 0.0827 -
## C5 2.3e-11 2.4e-12 3.2e-16 3.2e-05 0.0144
##
## P value adjustment method: holm
grid_cols = "gray80"
par(mar = c(3, 3, 1, 1))
b = beanplot::beanplot(avg_meth ~ Cluster, data = mean_meth, xaxt="n", boxwex=0.5, outline=FALSE, lty=1, lwd = 1.4, outwex = 0,
staplewex = 0, ylim = c(0.5, 1), axes = FALSE, border = "black", ll = 0.05, innerborder = NA, col = lapply(rev(color_codes$Cluster), function(x) c(x, "gray")))
abline(v = 1:6, col = grid_cols, lty = 2)
abline(h = seq(0.5, 1, 0.1), col = grid_cols, lty = 2)
axis(side = 2, at = seq(0.5, 1, 0.1), font = 1, las = 2, cex.axis = 0.8)
axis(side = 1, at = 1:length(b$names), font = 1, labels = b$names, cex.axis = 0.62)
mtext(text = "Cluster", side = 1, font = 1, line = 2, cex = 0.8)
mtext(text = expression(paste("Average ", beta, " value")), side = 2, font = 1, line = 2, cex = 0.8)
text(x = 1:length(b$n), y = 1, labels = table(mean_meth$Cluster), font = 3, cex = 0.8)
#Add significant values
rect(xleft = 1, ybottom = 0.75, xright = 4, ytop = 0.75, col = "black")
text(x = 2.5, y = 0.76, labels = "***", cex = 0.4) #round(pt_test$p.value["C3", "Normal"], digits = 3)
rect(xleft = 1, ybottom = 0.8, xright = 5, ytop = 0.8, col = "black")
text(x = 3, y = 0.81, labels = "***", cex = 0.4) #round(pt_test$p.value["C4", "Normal"], digits = 3)
rect(xleft = 1, ybottom = 0.86, xright = 6, ytop = 0.86, col = "black")
text(x = 3.5, y = 0.87, labels = "***", cex = 0.6) #round(pt_test$p.value["C5", "Normal"], digits = 3)
#Simplify maturation arrest stages. Classify them into alpha-beta and gamma-delta llineage
tumor_data$Maturation_Arrest_Stage_simple = ifelse(
test = tumor_data$Maturation_Arrest_Stage %in% c("IMG", "IMD", "IM0"),
yes = "GD_lineage",
no = ifelse(
test = tumor_data$Maturation_Arrest_Stage %in% c("TCR_AB_Pos", "IMB"),
yes = "AB_lineage",
no = tumor_data$Maturation_Arrest_Stage
)
)
table(tumor_data$Maturation_Arrest_Stage_simple, tumor_data$Maturation_Arrest_Stage)
##
## IM0 IMB IMD IMG TCR_AB_Pos TCR_GD_Pos
## AB_lineage 0 66 0 0 14 0
## GD_lineage 7 0 11 15 0 0
## TCR_GD_Pos 0 0 0 0 0 14
samp_anno = pData(tumor_data)[,c("Sample_Name", "Maturation_Arrest_Stage_simple", "ETP_Phenotype", "Cluster")]
samp_anno$Sample_Name = rownames(samp_anno)
samp_anno$Maturation_Arrest_Stage_simple = ifelse(test = samp_anno$Maturation_Arrest_Stage_simple == "NA", yes = NA, no = samp_anno$Maturation_Arrest_Stage_simple)
samp_anno$ETP_Phenotype = ifelse(test = samp_anno$ETP_Phenotype == "NA", yes = NA, no = samp_anno$ETP_Phenotype)
clust_encrichments = cluster_enrichment(clust_tbl = samp_anno[, c("Sample_Name", "Cluster")],
anno_tbl = samp_anno[, c("Sample_Name", "Maturation_Arrest_Stage_simple", "ETP_Phenotype")])
## Removed 16 NAs from Maturation_Arrest_Stage_simple
## Removed 18 NAs from ETP_Phenotype
#Remove irrelevent enrichments
clust_encrichments = clust_encrichments[!Feature_lvl %in% c("Negative", "Low", "No", "NA")]
#Significant enrichments
print(clust_encrichments[p_value < 0.01])
## Cluster Feature_lvl n_featurLvl n_featurLvl2 p_value OR_low
## 1: Cluster_C5 Yes 14/13 11/87 1.932119e-05 3.313091
## 2: Cluster_C1 Yes 7/4 18/96 1.116473e-03 2.528954
## 3: Cluster_C5 GD_lineage 18/11 15/83 2.203987e-06 3.732913
## 4: Cluster_C4 AB_lineage 32/4 48/43 7.470931e-05 2.597500
## 5: Cluster_C2 AB_lineage 28/3 52/44 1.441678e-04 2.550570
## 6: Cluster_C1 GD_lineage 9/3 24/91 2.601102e-04 3.054148
## 7: Cluster_C3 TCR_GD_Pos 6/13 8/100 7.174160e-03 1.702837
## OR_high cluster Feature
## 1: Inf Cluster_C5 ETP_Phenotype
## 2: Inf Cluster_C1 ETP_Phenotype
## 3: Inf Cluster_C5 Maturation_Arrest_Stage_simple
## 4: Inf Cluster_C4 Maturation_Arrest_Stage_simple
## 5: Inf Cluster_C2 Maturation_Arrest_Stage_simple
## 6: Inf Cluster_C1 Maturation_Arrest_Stage_simple
## 7: Inf Cluster_C3 Maturation_Arrest_Stage_simple
#msa_data = samp_anno[colnames(clust_dat_cor_mat), c("Maturation_Arrest_Stage", "ETP_Phenotype", "Cluster")]
samp_anno$Cluster = as.character(samp_anno$Cluster)
msa_data = samp_anno[colnames(clust_dat_cor_mat), c("Maturation_Arrest_Stage_simple", "ETP_Phenotype", "Cluster")]
colnames(msa_data)[1] = 'Maturation_Arrest_Stage'
msa_data = msa_data[colnames(clust_dat_cor_mat),]
msa_data$Maturation_Arrest_Stage = ifelse(
test = msa_data$Maturation_Arrest_Stage == "NA",
yes = "ND",
no = msa_data$Maturation_Arrest_Stage
)
msa_lvls = c("GD_lineage", "AB_lineage", "TCR_GD_Pos")
msa_matrix = data.frame(row.names = rownames(msa_data))
for(m in msa_lvls){
msa_matrix = cbind(msa_matrix,
ifelse(
test = msa_data$Maturation_Arrest_Stage == m,
yes = 1,
no = ifelse(
test = msa_data$Maturation_Arrest_Stage == "ND",
yes = NA,
no = 0
)
))
colnames(msa_matrix)[ncol(msa_matrix)] = m
}
msa_matrix$ETP = ifelse(test = msa_data$ETP_Phenotype == "Yes", yes = 1, no = ifelse(test = msa_data$ETP_Phenotype == "No", yes = 0, no = NA))
msa_matrix = msa_matrix[,rev(c("ETP", msa_lvls))]
msa_matrix$Cluster = as.character(msa_data[rownames(msa_matrix), "Cluster"])
msa_matrix = msa_matrix[order(msa_matrix$Cluster, -msa_matrix$ETP, -msa_matrix$GD_lineage, -msa_matrix$AB_lineage, -msa_matrix$TCR_GD_Pos),]
nm2 = matrix(data = msa_matrix$Cluster, nrow = nrow(msa_matrix), ncol = 1)
msa_matrix = msa_matrix[,-which(colnames(msa_matrix) == "Cluster")]
msa_matrix[is.na(msa_matrix)] = 2 #Set NA to 2.
msa_matrix = as.matrix(msa_matrix)
#msa_matrix = cbind(cluster = as.numeric(gsub(pattern = "C", replacement = "", x = nm2))+2, msa_matrix)
clust_cols = color_codes$Cluster[1:5]
layout(mat = matrix(data = c(1, 2), nrow = 2), heights = c(5, 1))
par(mar = c(0, 4, 1, 1))
image(x = 1:nrow(msa_matrix), y = 1:ncol(msa_matrix), z = msa_matrix, col = c("white", "maroon", "gray"), axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(msa_matrix)) + 0.5, col = "white")
points(which(msa_matrix == 0, arr.ind = TRUE), pch=".", col = "black")
mtext(text = colnames(msa_matrix), side = 2, at = 1:ncol(msa_matrix), font = 3, line = 0.4, las = 2, cex = 0.7)
abline(v = (1:nrow(msa_matrix)) + 0.5, col = "white")
par(mar = c(1, 4, 0.2, 1))
image(x = 1:nrow(nm2), y = 1:ncol(nm2), z = as.matrix(as.numeric(gsub(pattern = "C", replacement = "", x = nm2))), col = clust_cols, axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(msa_matrix)) + 0.5, col = "white")
abline(v = (1:nrow(msa_matrix)) + 0.5, col = "white")
plotEnrichment(clust_encrichments, sortByCluster = TRUE, showlegend = FALSE, neg_ylim = 1, pthr = 0.01)
gen_matrix = pData(tumor_data)[,c("TAL1", "TLX1", "TLX3", "HOXA_trans", "HOXA_cis", "HOXA_nd", "Cluster")]
gen_matrix$Cluster = as.character(msa_data[rownames(gen_matrix), "Cluster"])
gen_matrix = gen_matrix[order(gen_matrix$Cluster, -gen_matrix$TAL1, -gen_matrix$TLX1, -gen_matrix$TLX3, -gen_matrix$HOXA_trans, -gen_matrix$HOXA_cis, -gen_matrix$HOXA_nd),]
gen_matrix[is.na(gen_matrix)] = 2
nm2 = matrix(data = gen_matrix$Cluster, nrow = nrow(gen_matrix), ncol = 1)
gen_matrix = gen_matrix[,-which(colnames(gen_matrix) == "Cluster")]
gen_matrix = as.matrix(gen_matrix)
layout(mat = matrix(data = c(1, 2), nrow = 2), heights = c(5, 1))
par(mar = c(0, 4, 1, 1))
image(x = 1:nrow(gen_matrix), y = 1:ncol(gen_matrix), z = gen_matrix, col = c("white", "maroon", "gray"), axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(gen_matrix)) + 0.5, col = "white")
points(which(gen_matrix == 0, arr.ind = TRUE), pch=".", col = "black")
mtext(text = colnames(gen_matrix), side = 2, at = 1:ncol(gen_matrix), font = 3, line = 0.4, las = 2, cex = 0.7)
abline(v = (1:nrow(gen_matrix)) + 0.5, col = "white")
par(mar = c(1, 4, 0.2, 1))
image(x = 1:nrow(nm2), y = 1:ncol(nm2), z = as.matrix(as.numeric(gsub(pattern = "C", replacement = "", x = nm2))), col = clust_cols, axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(msa_matrix)) + 0.5, col = "white")
abline(v = (1:nrow(msa_matrix)) + 0.5, col = "white")
temp_anno = pData(tumor_data)[,c("Sample_Name", "TAL1", "TLX1", "TLX3", "HOXA_trans", "HOXA_cis", "HOXA_nd", "Cluster")]
tf_enrich = cluster_enrichment(clust_tbl = temp_anno[,c("Sample_Name", "Cluster")], anno_tbl = temp_anno[,c("Sample_Name", "TAL1", "TLX1", "TLX3", "HOXA_trans", "HOXA_cis", "HOXA_nd")])
## Removed 6 NAs from TAL1
## Removed 6 NAs from TLX1
## Removed 6 NAs from TLX3
## Removed 15 NAs from HOXA_trans
## Removed 15 NAs from HOXA_cis
## Removed 15 NAs from HOXA_nd
tf_enrich = tf_enrich[Feature_lvl %in% 1]
tf_enrich[,Feature_lvl := Feature]
#Results
print(tf_enrich[p_value < 0.01])
## Cluster Feature_lvl n_featurLvl n_featurLvl2 p_value OR_low
## 1: Cluster_C4 HOXA_cis 7/30 0/91 1.089171e-04 5.263243
## 2: Cluster_C1 HOXA_nd 4/7 7/110 7.352185e-03 1.948147
## 3: Cluster_C5 HOXA_trans 16/14 2/96 1.853806e-10 12.773460
## 4: Cluster_C2 TAL1 16/17 1/103 5.815153e-11 15.730688
## 5: Cluster_C4 TLX1 25/13 3/96 7.574931e-15 17.677091
## 6: Cluster_C3 TLX3 14/8 5/110 5.715071e-10 11.418244
## OR_high cluster Feature
## 1: Inf Cluster_C4 HOXA_cis
## 2: Inf Cluster_C1 HOXA_nd
## 3: Inf Cluster_C5 HOXA_trans
## 4: Inf Cluster_C2 TAL1
## 5: Inf Cluster_C4 TLX1
## 6: Inf Cluster_C3 TLX3
tf_enrich = tf_enrich[!Feature_lvl %in% "HOXA_nd" ] #Remove undetermined HOXA enrichments
plotEnrichment(res = tf_enrich, sortByCluster = TRUE, showlegend = FALSE, neg_ylim = 0.5, pthr = 0.01)
maf_anno = data.table::copy(x = pData(tumor_data))
colnames(maf_anno)[1] = 'Tumor_Sample_Barcode'
#Read variants
vc_color_codes = c(
"Mutated" = "#33A02CFF",
"Amplification" = "#1F78B4FF",
'Deletion' = "#6A3D9AFF",
"Multi_Hit" = 'black'
)
#Missing samples: 465 and 616
tall_maf = maftools::read.maf(maf = "data/targetted_NGS/TALL.maf",
clinicalData = maf_anno,
vc_nonSyn = c("Mutated", "Amplification", "Deletion"), removeDuplicatedVariants = FALSE)
## -Reading
## -Validating
## --Non MAF specific values in Variant_Classification column:
## Amplification
## Deletion
## Mutated
## -Summarizing
## -Processing clinical data
## -Finished in 0.063s elapsed (0.063s cpu)
tall_maf
## An object of class MAF
## ID summary Mean Median
## 1: NCBI_Build NA NA NA
## 2: Center NA NA NA
## 3: Samples 143 NA NA
## 4: nGenes 69 NA NA
## 5: Amplification 31 0.217 0
## 6: Deletion 188 1.315 1
## 7: Mutated 541 3.783 3
## 8: total 760 5.315 5
#Gene -> Pathway mappings
pathways = data.table::fread(input = "data/targetted_NGS/Leukemic_pathways.tsv")
path_names = names(table(pathways$Group))
pathways$name = factor(x = pathways$Group, levels = path_names, labels = c("CellCycle", "DNAm", "Epigenomic", "JAKSTAT", "Notch", "Other", "PI3KAKT", "Polycomb", 'RAS', "Transcription"))
path_order = c("Notch", "CellCycle", "PI3KAKT", "RAS", "JAKSTAT", "DNAm", "Polycomb", "Epigenomic", "Transcription", "Other")
gs = getGeneSummary(x = tall_maf)[,.(Hugo_Symbol, AlteredSamples)]
pathways = merge(gs, pathways)
pathways = lapply(split(pathways, as.factor(pathways$name))[path_order], function(x){
x[order(AlteredSamples, decreasing = TRUE)]
})
pathways = data.table::rbindlist(pathways)
maftools::oncoplot(tall_maf, pathways = pathways[,.(Hugo_Symbol, name)], drawRowBar = FALSE, drawColBar = FALSE, clinicalFeatures = "Cluster", annotationColor = list(Cluster = clust_cols), sortByAnnotation=TRUE, colors = vc_color_codes, fontSize=0.4, annotationOrder = c("C1", "C2", "C3", "C4", "C5"), anno_height = 0.5, legend_height = 2, showTitle = FALSE)
cluster_enrich = clinicalEnrichment(maf = tall_maf, clinicalFeature = "Cluster")
## Sample size per factor in Cluster:
##
## C1 C2 C3 C4 C5
## 14 34 22 41 32
#Significant enrichments
cluster_enrich$groupwise_comparision[fdr < 0.05]
## Hugo_Symbol Group1 Group2 n_mutated_group1 n_mutated_group2 p_value
## 1: CDKN2A C5 Rest 7 of 32 92 of 111 2.669157e-10
## 2: SUZ12 C5 Rest 15 of 32 8 of 111 1.218342e-06
## 3: PHF6 C2 Rest 2 of 34 53 of 109 2.016031e-06
## 4: DNMT3A C1 Rest 9 of 14 11 of 129 4.277816e-06
## 5: CDKN2A C4 Rest 39 of 41 60 of 102 6.191327e-06
## 6: CDKN2A C2 Rest 33 of 34 66 of 109 1.249673e-05
## 7: IDH2 C1 Rest 5 of 14 1 of 129 2.446480e-05
## 8: NF1 C5 Rest 10 of 32 4 of 111 4.685076e-05
## 9: NOTCH1 C4 Rest 37 of 41 58 of 102 7.680197e-05
## 10: JAK3 C5 Rest 13 of 32 11 of 111 1.721430e-04
## 11: SUZ12 C4 Rest 0 of 41 23 of 102 2.584708e-04
## 12: PTEN C2 Rest 11 of 34 8 of 109 6.040793e-04
## 13: PHF6 C4 Rest 25 of 41 30 of 102 6.143953e-04
## 14: PHF6 C1 Rest 0 of 14 55 of 129 9.275042e-04
## 15: JAK3 C2 Rest 0 of 34 24 of 109 1.121977e-03
## 16: STAT5B C3 Rest 5 of 22 3 of 121 2.292750e-03
## 17: WT1 C3 Rest 7 of 22 8 of 121 2.298464e-03
## 18: BCL11B C4 Rest 12 of 41 9 of 102 3.396357e-03
## 19: DNMT3A C2 Rest 0 of 34 20 of 109 3.874694e-03
## 20: EZH2 C5 Rest 6 of 32 3 of 111 4.179499e-03
## 21: DNM2 C2 Rest 1 of 34 27 of 109 5.198835e-03
## Hugo_Symbol Group1 Group2 n_mutated_group1 n_mutated_group2 p_value
## OR OR_low OR_high fdr
## 1: 0.05950987 0.018839122 0.1660150 4.270651e-08
## 2: 11.06857487 3.757354339 35.3367322 9.746739e-05
## 3: 0.06701084 0.007407585 0.2847145 1.075217e-04
## 4: 18.50610889 4.660235094 83.8336974 1.711126e-04
## 5: 13.46576850 3.182464883 121.4332414 1.981225e-04
## 6: 21.21639436 3.290439607 890.8298588 3.332461e-04
## 7: 65.64336057 6.446071882 3303.6500748 5.591954e-04
## 8: 11.85071806 3.079422202 56.6029742 9.370152e-04
## 9: 6.93484630 2.247323230 28.8001954 1.365368e-03
## 10: 6.11260917 2.173958129 17.6856582 2.754289e-03
## 11: 0.00000000 0.000000000 0.3600663 3.759576e-03
## 12: 5.93725003 1.932418018 19.1460753 7.561788e-03
## 13: 3.71175929 1.644834256 8.6161540 7.561788e-03
## 14: 0.00000000 0.000000000 0.4306937 1.060005e-02
## 15: 0.00000000 0.000000000 0.4490493 1.196775e-02
## 16: 11.22704508 1.985395740 78.9098421 2.163261e-02
## 17: 6.45691132 1.731091376 23.9120694 2.163261e-02
## 18: 4.22381857 1.468207770 12.6229066 3.018984e-02
## 19: 0.00000000 0.000000000 0.5716395 3.262900e-02
## 20: 8.14068178 1.613490446 53.6327802 3.343599e-02
## 21: 0.09296741 0.002187172 0.6134001 3.961017e-02
## OR OR_low OR_high fdr
writexl::write_xlsx(x = cluster_enrich$groupwise_comparision, path = "Supplementary_table_S1.xlsx", col_names = TRUE, format_headers = TRUE)
clust_enrich_genes = unique(cluster_enrich$groupwise_comparision[fdr < 0.05, Hugo_Symbol])
#Order results based on pathways
clust_enrich_genes = pathways[Hugo_Symbol %in% clust_enrich_genes][order(Group, -AlteredSamples)]
maftools::oncoplot(
maf = tall_maf,
colors = vc_color_codes,
drawColBar = FALSE,
showTitle = FALSE,
drawRowBar = FALSE,
top = 50,
clinicalFeatures = c("Cluster"),
sortByAnnotation = TRUE,
annotationOrder = c("C1", "C2", "C3", 'C4', "C5"),
genes = clust_enrich_genes[, Hugo_Symbol],
annotationColor = list(Cluster = clust_cols),
keepGeneOrder = TRUE, sepwd_genes = 1, sepwd_samples = 1,
legendFontSize = 1, annotationFontSize = 1, removeNonMutated = FALSE, showTumorSampleBarcodes = FALSE)
#Define color codes for variant types
vc_cols = maftools:::get_vcColors()[c(2,4,8,10,13)]
names(vc_cols) = c("coding", "splicing", "stop", "frameshift", "non-frameshift")
vc_cols = c(vc_cols, Multi_Hit = "black", Amp = "red", Del = 'royalblue')
dnmt3a_idh2 = maftools::read.maf(maf = "data/targetted_NGS/DNMT3A_IDH2.maf", vc_nonSyn = names(vc_cols), verbose = FALSE)
lollipopPlot(maf = dnmt3a_idh2, gene = "DNMT3A", colors = vc_cols, refSeqID = "NM_175629", labelPos = 882, collapsePosLabel = TRUE, legendTxtSize = 0.6, axisTextSize = c(0.5, 0.5), domainLabelSize = 0.6, pointSize = 1, titleSize = c(0.8, 0.6))
lollipopPlot(maf = dnmt3a_idh2, gene = "IDH2", colors = vc_cols, refSeqID = "NM_002168", labelPos = 140, collapsePosLabel = TRUE, legendTxtSize = 0.6, axisTextSize = c(0.5, 0.5), domainLabelSize = 0.6, pointSize = 1, titleSize = c(0.8, 0.6))
Identify DMPs per cell-type
normal_anno = pData(normal_data)[, c("Sample_Name", "Cell_Type")]
normal_anno$Cell_Type = factor(
x = normal_anno$Cell_Type,
levels = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
)
design = model.matrix(~0+normal_anno$Cell_Type)
colnames(design) = levels(normal_anno$Cell_Type)
#See here for design matrix details: https://support.bioconductor.org/p/26251/
contrast_matrix =
limma::makeContrasts(CD34 - (ISP+SPo4+SPo8+TCRn+TCRp)/5,
ISP - (CD34+SPo4+SPo8+TCRn+TCRp)/5,
TCRn - (CD34+SPo4+SPo8+ISP+TCRp)/5,
TCRp - (CD34+SPo4+SPo8+ISP+TCRn)/5,
SPo4 - (CD34+TCRp+SPo8+ISP+TCRn)/5,
SPo8 - (CD34+TCRp+SPo4+ISP+TCRn)/5,
levels = design)
fit = limma::lmFit(object = exprs(normal_data), design = design)
fit2 = limma::contrasts.fit(fit = fit, contrasts = contrast_matrix)
fit2 = limma::eBayes(fit = fit2)
normal_ct_res = lapply(1:6, function(i){
x = limma::topTable(fit2,
coef = i,
adjust = "BH",
number = "all")
data.table::setDT(x = x, keep.rownames = TRUE)
})
names(normal_ct_res) = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
#Add CpG annotations to limma results
epic_anno = data.table::as.data.table(fData(normal_data))
normal_ct_res = lapply(normal_ct_res, function(x){
x = merge(x, epic_anno)
x
})
sup_table_s2 = lapply(normal_ct_res, function(x){
x = x[order(adj.P.Val)][adj.P.Val < 0.05 & abs(logFC) > 0.1]
x
})
writexl::write_xlsx(x = sup_table_s2, path = "Supplementary_table_S2.xlsx", col_names = TRUE, format_headers = TRUE)
#Extract all unique DM probes
normal_ct_res_probes = lapply(normal_ct_res, function(x){
x = x[order(adj.P.Val)][adj.P.Val < 0.05 & abs(logFC) > 0.1, rn]
x
})
#Number of DMPs (hyper+hypo) per cell-type
unlist(lapply(normal_ct_res_probes, length))
## CD34 ISP TCRn TCRp SPo4 SPo8
## 13333 5353 1529 1304 10221 12054
Overlap of thymic DMPs
fit2_res_probes = lapply(normal_ct_res, function(x){
x = x[order(adj.P.Val)][adj.P.Val < 0.05 & abs(logFC) > 0.1, rn]
x
})
olaps = ComplexHeatmap::make_comb_mat(fit2_res_probes)
ComplexHeatmap::UpSet(m = olaps, pt_size = unit(1.5, "mm"), lwd = 1, row_names_gp = gpar(fontsize = 8), set_order = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8"))
normal_ct_res_probes_uniq = unique(unlist(normal_ct_res_probes))
print(length(normal_ct_res_probes_uniq))
## [1] 23069
normal_thymic_specific_dat = order_by_sds(normal_data[normal_ct_res_probes_uniq,])
print(dim(normal_thymic_specific_dat))
## Features Samples
## 23069 12
pcaMset(normal_thymic_specific_dat, phenoCol="Cell_Type", phenoColors=color_codes$Cell_Type[1:6], legendPos = "bottom", legendNcol = 1)
#Background ensemble IDs (all unique DMPs)
bg_ens_ids = unique(epic_anno[rn %in% normal_ct_res_probes_uniq, Nearest_Ens])
bg_ens_ids = bg_ens_ids[!bg_ens_ids %in% ""]
#print(length(bg_ens_ids))
#Run GO for each celltype DMPs
normal_ct_go = lapply(normal_ct_res, function(x){
xens = unique(x[adj.P.Val < 0.05 & abs(logFC) > 0.1, Nearest_Ens])
xens = xens[!xens %in% ""]
xgo = goseq_wrapper(assayedGenes = bg_ens_ids, deGenes = xens)
xgo[, log_pvalue := -log10(over_represented_pvalue)]
xgo
})
layout(mat = matrix(1:6, nrow = 2, ncol = 3, byrow = TRUE), widths = c(2, 2))
par(mar = c(2, 1, 1, 1))
for(i in 1:6){
x = normal_ct_go[[i]][1:10]
x_lims = ceiling(max(x[,log_pvalue]))
plot(NA, xlim = c(0, x_lims), ylim = c(0, 10), axes = FALSE, xlab = NA, ylab = NA)
#abline(v = seq(0, x_lims, by = 1), col = 'gray90', lty = 2)
for(j in 1:nrow(x)){
rect(
xleft = x[j, log_pvalue],
ybottom = j - 1,
xright = 0,
ytop = j,
col = grDevices::adjustcolor(col = "black", alpha.f = 0.2),
border = "white"
)
}
axis(side = 1, at = pretty(c(0, x_lims)))
abline(v = 2, lty = 2, col = 'maroon')
title(main = names(normal_ct_go)[i], adj = 0, font.main = 4)
text(x = rep(0, nrow(x)), y = seq(0.5, nrow(x)-0.5, 1), labels = x[, term], adj = 0, xpd = TRUE, col = 'black', font = 1)
}
Might look slightly different from the publication due to seed values
#Based on visual inspection of heatmap `k = 8`
k_centers = 8
set.seed(1024)
km8 = kmeans(x = normal_thymic_specific_dat, centers = k_centers)
km8_clust_assn = as.data.frame(km8$cluster)
colnames(km8_clust_assn)[1] = 'cluster'
km8_clust_assn$cluster = paste0("K", km8_clust_assn$cluster)
km8_clust_assn$cluster = factor(km8_clust_assn$cluster, levels = paste0("K", 1:k_centers))
thymic_ct_cols = RColorBrewer::brewer.pal(n = 6, name = "Dark2")
names(thymic_ct_cols) = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
#Draw heatmap
col_fun = circlize::colorRamp2(breaks = seq(0, 0.99, 0.01),
colors = grDevices::colorRampPalette(rev(
RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")
))(100))
col_anno = ComplexHeatmap::HeatmapAnnotation(df = normal_anno[, c("Cell_Type"), drop = FALSE], col = list(sample_type = thymic_ct_cols))
ComplexHeatmap::Heatmap(
matrix = as.matrix(normal_thymic_specific_dat),
row_split = km8_clust_assn,
show_row_names = FALSE,
top_annotation = col_anno,
col = col_fun,
column_gap = grid::unit(10, "mm"),
border = TRUE, show_column_names = FALSE
)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
#Avg meth (by replicates)
normal_thymic_specific_dat_avg = lapply(seq(1, 12, 2), function(i){
rowMeans(exprs(normal_thymic_specific_dat[,c(i, i+1)]))
})
normal_thymic_specific_dat_avg = as.data.frame(normal_thymic_specific_dat_avg)
colnames(normal_thymic_specific_dat_avg) = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
#Draw a line group per cluster
normal_thymic_specific_dat_avg = normal_thymic_specific_dat_avg[rownames(km8_clust_assn),]
normal_thymic_specific_dat_avg$Cluster = km8_clust_assn$cluster
data.table::setDT(x = normal_thymic_specific_dat_avg, keep.rownames = TRUE)
normal_thymic_specific_dat_avg = data.table::melt(data = normal_thymic_specific_dat_avg)
## Warning in melt.data.table(data = normal_thymic_specific_dat_avg): id.vars
## and measure.vars are internally guessed when both are 'NULL'. All non-numeric/
## integer/logical type columns are considered id.vars, which in this case are
## columns [rn, Cluster, ...]. Consider providing at least one of 'id' or 'measure'
## vars in future.
normal_thymic_specific_dat_avg = normal_thymic_specific_dat_avg[,mean(value), .(Cluster, variable)]
normal_thymic_specific_dat_avg = data.table::dcast(data = normal_thymic_specific_dat_avg, variable ~ Cluster, value.var = 'V1')
data.table::setDF(x = normal_thymic_specific_dat_avg, rownames = as.character(normal_thymic_specific_dat_avg$variable))
normal_thymic_specific_dat_avg = normal_thymic_specific_dat_avg[,-1]
normal_thymic_specific_dat_avg = t(normal_thymic_specific_dat_avg)
temp_col_pal = RColorBrewer::brewer.pal(n = 8, name = 'Dark2')
#Draw a line plot for dynamics of methylation per `k`
par(mar = c(2, 2, 1, 1), oma = c(0, 0, 0, 1))
plot(NA, xlim = c(1, 6), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)
abline(h = seq(0, 1, 0.2), v = 1:6, col = 'gray70', lty = 2)
for(i in 1:nrow(normal_thymic_specific_dat_avg)){
ri = normal_thymic_specific_dat_avg[i,]
points(1:6, y = ri, type = 'l', col = temp_col_pal[i])
points(1:6, y = ri, pch = 21, bg = temp_col_pal[i], col = 'black', cex = 0.6)
}
axis(side = 1, at = 1:6, labels = colnames(normal_thymic_specific_dat_avg), cex.axis = 0.6)
axis(side = 2, at = seq(0, 1, 0.2), las = 2, cex.axis = 0.8)
par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE)
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n")
legend(x = "topright", legend = rownames(normal_thymic_specific_dat_avg), col = temp_col_pal, lty = 1, xpd = TRUE, cex = 0.6)
#In manuscript K = tDMP
tDMPdata.table::setDT(x = km8_clust_assn, keep.rownames = TRUE)
k_go_res = lapply(levels(km8_clust_assn$cluster), function(k){
k_ens = unique(epic_anno[rn %in% km8_clust_assn[cluster %in% k, rn], Nearest_Ens])
k_ens = k_ens[!k_ens %in% ""]
k_go = goseq_wrapper(assayedGenes = bg_ens_ids, deGenes = k_ens)
k_go[, log_pvalue := -log10(over_represented_pvalue)]
k_go
})
names(k_go_res) = levels(km8_clust_assn$cluster)
#Sup-table 3
sup_table_s3 = lapply(X = split(km8_clust_assn, as.factor(km8_clust_assn$cluster)), FUN = function(x){
merge(x, epic_anno, by = 'rn')
})
writexl::write_xlsx(x = sup_table_s3, path = "Supplementary_table_S3.xlsx", col_names = TRUE, format_headers = TRUE)
layout(mat = matrix(1:8, nrow = 2, ncol = 4, byrow = TRUE))
par(mar = c(2, 1, 1, 1))
for(i in 1:8){
x = k_go_res[[i]][1:10]
x_lims = ceiling(max(x[,log_pvalue]))
plot(NA, xlim = c(0, x_lims), ylim = c(0, 10), axes = FALSE, xlab = NA, ylab = NA)
#abline(v = seq(0, x_lims, by = 1), col = 'gray90', lty = 2)
for(j in 1:nrow(x)){
rect(
xleft = x[j, log_pvalue],
ybottom = j - 1,
xright = 0,
ytop = j,
col = grDevices::adjustcolor(col = "black", alpha.f = 0.2),
border = "white"
)
}
axis(side = 1, at = pretty(c(0, x_lims)))
abline(v = 2, lty = 2, col = 'maroon')
title(main = names(k_go_res)[i], adj = 0, font.main = 4)
text(x = rep(0, nrow(x)), y = seq(0.5, nrow(x)-0.5, 1), labels = x[, term], adj = 0, xpd = TRUE, col = 'black', font = 2)
}
#In manuscript K = tDMP
normal_ct_phyl_data = normal_data[normal_ct_res_probes_uniq,]
normal_ct_phyl_dist = stats::dist(t(exprs(normal_ct_phyl_data)), method = "man")
traj_ip_normal_dist_nj = ape::nj(X = normal_ct_phyl_dist)
normal_anno$Cell_Type_colors = color_codes$Cell_Type[as.character(normal_anno[colnames(normal_ct_phyl_data), "Cell_Type"])]
par(mar = c(0, 1, 0, 1))
plot_tree(tree = traj_ip_normal_dist_nj, col_df = normal_anno[,c("Cell_Type_colors"), drop = FALSE], col_id = 1, type = 'unrooted', rotate = 90, cex = 0.4)
#legend(x = "bottom", legend = names(color_codes$Cell_Type[1:6]), col = color_codes$Cell_Type[1:6], ncol = 3, cex = 0.6)
Phylogenetic trees with random set of probes
#Random set-1
set.seed(seed = 1024)
rand_probes_1 = sample(x = rownames(normal_data)[!rownames(normal_data) %in% normal_ct_res_probes_uniq], size = length(normal_ct_res_probes_uniq), replace = FALSE)
normal_ct_phyl_data_rand1 = normal_data[rand_probes_1,]
dim(normal_ct_phyl_data_rand1)
## Features Samples
## 23069 12
normal_ct_phyl_data_rand1_dist = stats::dist(t(exprs(normal_ct_phyl_data_rand1)), method = "man")
normal_ct_phyl_data_rand1_nj = ape::nj(X = normal_ct_phyl_data_rand1_dist)
#Random set-2
set.seed(seed = 1024)
rand_probes_2 = sample(x = rownames(normal_data)[!rownames(normal_data) %in% normal_ct_res_probes_uniq], size = length(normal_ct_res_probes_uniq)*2, replace = FALSE)
normal_ct_phyl_data_rand2 = normal_data[rand_probes_2,]
dim(normal_ct_phyl_data_rand2)
## Features Samples
## 46138 12
normal_ct_phyl_data_rand2_dist = stats::dist(t(exprs(normal_ct_phyl_data_rand2)), method = "man")
normal_ct_phyl_data_rand2_nj = ape::nj(X = normal_ct_phyl_data_rand2_dist)
par(mar = c(0, 0, 0, 0), mfrow = c(1, 2))
plot_tree(tree = normal_ct_phyl_data_rand1_nj, col_df = normal_anno[,c("Cell_Type_colors"), drop = FALSE], col_id = 1, type = 'unrooted', cex = 0.4)
plot_tree(tree = normal_ct_phyl_data_rand2_nj, col_df = normal_anno[,c("Cell_Type_colors"), drop = FALSE], col_id = 1, type = 'unrooted', rotate = 270, cex = 0.4)
non_thymic_probes = rownames(normal_data)[!rownames(normal_data) %in% normal_ct_res_probes_uniq]
non_thymic_probes = intersect(non_thymic_probes, rownames(normal_data))
non_thymic_probes = intersect(non_thymic_probes, rownames(tumor_data))
complete_data_minus_t_cell_probes = Biobase::combine(normal_data[non_thymic_probes,], tumor_data[non_thymic_probes,])
complete_data_minus_t_cell_probes
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 756324 features, 155 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: sam_CD34_1 sam_CD34_2 ... sample_RAD_MI (155 total)
## varLabels: Sample_Name Age ... Maturation_Arrest_Stage_simple (53
## total)
## varMetadata: labelDescription
## featureData
## featureNames: cg26928153 cg16269199 ... cg02995750 (756324 total)
## fvarLabels: rn Chr ... Relation_to_Island (12 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
com_probes = intersect(rownames(normal_data), rownames(tumor_data))
complete_data_plus_t_cell_probes = cbind(exprs(normal_data[com_probes,]), exprs(tumor_data[com_probes,]))
complete_data_plus_t_cell_probes = Biobase::ExpressionSet(assayData = as.matrix(complete_data_plus_t_cell_probes))
complete_data_plus_t_cell_probes$Cell_Type = complete_data_minus_t_cell_probes$Cell_Type
complete_data_plus_t_cell_probes
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 779393 features, 155 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: sam_CD34_1 sam_CD34_2 ... sample_RAD_MI (155 total)
## varLabels: Cell_Type
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
par(mfrow = c(1, 2))
pcaMset(complete_data_minus_t_cell_probes, phenoCol="Cell_Type", phenoColors = color_codes$Cell_Type, legendNcol = 2)
pcaMset(complete_data_plus_t_cell_probes, phenoCol="Cell_Type", phenoColors = color_codes$Cell_Type, legendNcol = 2)
rm(complete_data_minus_t_cell_probes, complete_data_plus_t_cell_probes)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 10749205 574.1 30246528 1615.4 46783552 2498.6
## Vcells 296853472 2264.9 1110396454 8471.7 1387691599 10587.3
complete_data_all = Biobase::combine(tumor_data[rownames(normal_ct_phyl_data),], normal_ct_phyl_data)
complete_data_all$Cell_Type2 = c(as.character(complete_data_all$Cluster[1:143]), complete_data_all$Cell_Type[144:155])
print(dim(complete_data_all))
## Features Samples
## 23069 155
complete_data_all_pca = prcomp(t(exprs(complete_data_all)))
complete_data_all_pca_dat = as.data.frame(complete_data_all_pca$x[,c("PC1", "PC2", "PC3", "PC4"), drop= FALSE])
var_explained_all = complete_data_all_pca$sdev^2/sum(complete_data_all_pca$sdev^2)
complete_data_all_pca_dat = cbind(pData(complete_data_all)[rownames(complete_data_all_pca_dat),], complete_data_all_pca_dat)
complete_data_all_pca_dat$color = c(clust_cols, color_codes$Cell_Type)[complete_data_all_pca_dat$Cell_Type2]
temp_nt = complete_data_all_pca_dat[complete_data_all_pca_dat$Tissue == "Normal_Thymus", ]
temp_t = complete_data_all_pca_dat[complete_data_all_pca_dat$Tissue != "Normal_Thymus", ]
temp_t$color = grDevices::adjustcolor(col = temp_t$color, alpha.f = 0.4)
par(mar = c(2, 2, 1, 1))
plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(pretty(complete_data_all_pca_dat$PC1)), ylim = range(pretty(complete_data_all_pca_dat$PC2)))
abline(h = pretty(complete_data_all_pca_dat$PC2), v = pretty(complete_data_all_pca_dat$PC1), col = grid_cols, lty = 2)
points(x = temp_nt$PC1, y = temp_nt$PC2, bg = temp_nt$color, pch = 23, cex = 1)
points(x = temp_t$PC1, y = temp_t$PC2, col = temp_t$color, pch = 19, cex = 0.6)
axis(side = 1, at = pretty(complete_data_all_pca_dat$PC1), lwd = 1, font = 1, cex.axis = 0.6)
mtext(text = paste0("PC2 [", round(var_explained_all[2], digits = 2), "]"), side = 2, cex = 0.6)
mtext(text = paste0("PC1 [", round(var_explained_all[1], digits = 2), "]"), side = 1, cex = 0.6)
axis(side = 2, at = pretty(complete_data_all_pca_dat$PC2), lwd = 1, font = 1, las = 2, cex.axis = 0.6)
legend(x = "topright", legend = names(clust_cols)[1:5], col = grDevices::adjustcolor(col = clust_cols[1:5], alpha.f = 0.4), pch = 19, pt.cex = 0.6, cex = 0.6)
legend(x = "topleft", legend = names(color_codes$Cell_Type[1:6]), pt.bg = color_codes$Cell_Type[1:6], pch = 23, pt.cex = 0.6, cex = 0.6)
normal_ct_phyl_data = order_by_sds(mat = normal_ct_phyl_data)
msa_dat = exprs(complete_data_all[rownames(normal_ct_phyl_data)[1:2500],])
dim(msa_dat)
## [1] 2500 155
traj_ip_dist = stats::dist(t(msa_dat), method = "man")
traj_ip_dist_nj = ape::nj(X = traj_ip_dist)
#Color codes
complete_data_all_pca_dat$tree_cols = ifelse(test = complete_data_all_pca_dat$Tissue == "Normal_Thymus", yes = "black", no = complete_data_all_pca_dat$color)
par(mar = c(0, 0, 0, 0))
plot_tree(
tree = traj_ip_dist_nj,
col_df = complete_data_all_pca_dat,
col_id = which(x = colnames(complete_data_all_pca_dat) == "tree_cols"),
type = 'un',
cex = 0.1,
point_size = 0.6,
edge.width = 0.6,
rotate.tree = 50
)
msa_dat = as.data.frame(msa_dat)
data.table::setDT(x = msa_dat, keep.rownames = TRUE)
msa_dat_mlt = data.table::melt(data = msa_dat)
## Warning in melt.data.table(data = msa_dat): id.vars and measure.vars are
## internally guessed when both are 'NULL'. All non-numeric/integer/logical type
## columns are considered id.vars, which in this case are columns [rn, ...].
## Consider providing at least one of 'id' or 'measure' vars in future.
#data.table::setDT(x = tree_anno, keep.rownames = TRUE)
msa_dat_mlt = merge(msa_dat_mlt, complete_data_all_pca_dat[,c("Sample_Name", "Cell_Type2", "Cluster")], by.y = 'Sample_Name', by.x = 'variable')
msa_dat_mlt$Cluster = ifelse(test = is.na(as.character(msa_dat_mlt$Cluster)), yes = "Normal", no = as.character(msa_dat_mlt$Cluster))
msa_dat_mlt_normal = msa_dat_mlt[Cluster %in% "Normal"]
msa_dat_mlt_tumor = msa_dat_mlt[!Cluster %in% "Normal"]
msa_dat_mlt_tumor_avg = msa_dat_mlt_tumor[,mean(value), .(Cluster, rn)]
msa_dat_mlt_tumor_avg = data.table::dcast(data = msa_dat_mlt_tumor_avg, rn ~ Cluster, value.var = 'V1')
msa_dat_mlt_normal = data.table::dcast(data = msa_dat_mlt_normal, rn ~ variable, value.var = 'value')
msa_dat_tum_normal = merge(msa_dat_mlt_tumor_avg, msa_dat_mlt_normal)
data.table::setDF(x = msa_dat_tum_normal, rownames = msa_dat_tum_normal$rn)
msa_dat_tum_normal = msa_dat_tum_normal[,-1]
temp_coldf = data.frame(
row.names = colnames(msa_dat_tum_normal),
samp_type = gsub(
pattern = "sam_|_1|_2",
replacement = "",
x = colnames(msa_dat_tum_normal)
),
stringsAsFactors = FALSE
)
temp_coldf$color = c(color_codes$Cluster[1:5], color_codes$Cell_Type[1:6])[temp_coldf$samp_type]
temp_coldf$pch = ifelse(test = temp_coldf$samp_type %in% c("C1", "C2", "C3", "C4", "C5"), yes = 19, no = 23)
#NJ
msa_dat_tum_normal_dist = stats::dist(t(msa_dat_tum_normal), method = "man")
msa_dat_tum_normal_nj = ape::nj(X = msa_dat_tum_normal_dist)
#layout(mat = matrix(c(1, 2), nrow = 2, byrow = TRUE), heights = c(3, 1))
par(mar = c(1, 1, 1, 1))
plot_tree(tree = msa_dat_tum_normal_nj, col_df = temp_coldf, col_id = 2, type = 'un', cex = 0.6, point_size = 1, rotate.tree = 70, pch_id = 3)
RNA-seq data for normal thymic cell types are obtained from published dataset GSE151079. Data analysis was done using similar cell types for which DNAm EPIC arrays were generated.
normal_rna = data.table::fread(input = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE151nnn/GSE151079/suppl/GSE151079_RNAseq_thymocytes_raw_counts.txt.gz")
colnames(normal_rna) = gsub(pattern = " R1", replacement = "_R1", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = " R2", replacement = "_R2", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = "\\+", replacement = "p", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = "\\-", replacement = "n", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = " ", replacement = "_", x = colnames(normal_rna))
#normal_rna[,.N,biotype]
normal_rna = normal_rna[biotype %in% "protein_coding"]
coldata = data.frame(row.names = colnames(normal_rna)[2:23], Sample_Name = colnames(normal_rna)[2:23], Cell_type = gsub(pattern = "_R1|_R2", replacement = "", x = colnames(normal_rna)[2:23]))
gene_anno = normal_rna[,.(V1, seqname, seqstart, seqend, biotype, seqstrand, symbol)]
To compare with the DNAm we should use similar cell types:
coldata = coldata[coldata$Cell_type %in% c("CD34pCD1n", "ISP_CD28p", "DP_CD3n", "DP_CD3p", "SP_CD4p", 'SP_CD8p'),]
coldata = coldata[order(coldata$Cell_type),]
data.table::setDF(x = normal_rna, rownames = normal_rna$V1)
normal_rna = normal_rna[,rownames(coldata)]
if(all(rownames(coldata) != colnames(normal_rna))){
stop("Order mismatch!")
}
#gene_anno = data.frame(row.names = gene_anno$V1, ens_id = gene_anno$V1, Hugo_Symbol = gene_anno$symbol)
data.table::setDF(x = gene_anno, rownames = gene_anno$V1)
gene_anno = gene_anno[rownames(normal_rna),]
gene_anno = gene_anno[!duplicated(gene_anno$symbol),]
gene_anno = gene_anno[complete.cases(gene_anno),]
normal_rna = normal_rna[rownames(gene_anno),]
if(!all(rownames(gene_anno) == rownames(normal_rna))){
stop("Order mismatch!")
}
rownames(normal_rna) = gene_anno$symbol
#Done once and results are stored for re-use
dds = DESeq2::DESeqDataSetFromMatrix(countData = normal_rna, colData = coldata, design = ~Cell_type)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq2::DESeq(object = dds, parallel = TRUE)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
#saveRDS(object = dds, file = paste0(res_dir, "DDS_PC_genes_normalized.RDs"), version = 2)
vsd = DESeq2::vst(object = dds, blind = FALSE)
#saveRDS(object = vsd, file = paste0(res_dir, "VSD_PC_genes.RDs"), version = 2)
vsd_log_mat = order_by_sds(mat = as.data.frame(assay(vsd)))
ct_cols = color_codes$Cell_Type[1:6]
names(ct_cols) = c("CD34pCD1n", "ISP_CD28p", "DP_CD3n", "DP_CD3p", "SP_CD4p", "SP_CD8p")
traj_ip_dist = stats::dist(t(vsd_log_mat[1:1000,]), method = "man")
traj_ip_dist_nj = ape::nj(X = traj_ip_dist)
coldata$color = ct_cols[coldata$Cell_type]
par(mar = c(0, 0, 0, 0))
par(mar = c(0, 0, 0, 0))
plot_tree(tree = traj_ip_dist_nj, col_df = coldata, col_id = which(x = colnames(coldata) == "color"), type = 'un',
cex = 0.1, point_size = 1.2, edge.width = 0.6, rotate.tree = 130)
legend(x = "topright", legend = names(ct_cols), col = ct_cols, pch = 19, bty = "n", ncol = 2, cex = 0.7)
thymic_ct_temp = names(ct_cols)[2:6]
thymic_results = lapply(thymic_ct_temp, function(ct){
print(paste0("Comparing: ", ct, " vs. CD34pCD1n"))
cd_temp = coldata[coldata$Cell_type %in% c("CD34pCD1n", ct),, drop = FALSE]
counts_temp = normal_rna[,rownames(cd_temp)]
dds_ct = DESeq2::DESeqDataSetFromMatrix(countData = counts_temp, colData = cd_temp, design = ~ Cell_type)
dds_ct = DESeq2::DESeq(object = dds_ct, parallel = TRUE)
rds = DESeq2::results(object = dds_ct, contrast = c("Cell_type", ct, "CD34pCD1n"), name = ct, lfcThreshold = 0.6)
rds = data.table::as.data.table((as.data.frame(rds)), keep.rownames = TRUE)
rds[order(padj)]
})
## [1] "Comparing: ISP_CD28p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: DP_CD3n vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: DP_CD3p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: SP_CD4p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: SP_CD8p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
names(thymic_results) = thymic_ct_temp
thymic_deg_table = data.frame(
down = sapply(thymic_results, function(x)
nrow(x[padj < 0.1][log2FoldChange < 0])),
up = sapply(thymic_results, function(x)
nrow(x[padj < 0.1][log2FoldChange > 0]))
)
print(thymic_deg_table)
## down up
## ISP_CD28p 751 494
## DP_CD3n 1417 1053
## DP_CD3p 1548 1398
## SP_CD4p 1558 1713
## SP_CD8p 1322 1488
thymic_degs = unique(as.character(unlist(lapply(thymic_results, function(x) x[padj < 0.1, rn]))))
tss_probes = data.table::fread(input = "data/extdata/EPIC_TSS_probes.bed") #EPIC probes with 1.5kb of TSS
temp_normal_meth = as.data.table(exprs(normal_data), keep.rownames = TRUE)
temp_normal_meth = merge(tss_probes[,.(V4, V9)], temp_normal_meth, by.x = 'V9', by.y = 'rn') #Change V4 -> V5 for avg. methylation per tx
temp_normal_meth[,V9 := NULL]
normal_pm = temp_normal_meth[,lapply(.SD, mean, na.rm = TRUE), by = V4]
data.table::setDF(x = normal_pm, rownames = normal_pm$V4)
normal_pm$V4 = NULL
normal_pm = normal_pm[,c("sam_CD34_1", "sam_CD34_2", "sam_ISP_1", "sam_ISP_2", "sam_TCRn_1", "sam_TCRn_2", "sam_TCRp_1", "sam_TCRp_2", "sam_SPo4_1", "sam_SPo4_2", "sam_SPo8_1", "sam_SPo8_2")]
rm(temp_normal_meth, tss_probes)
Identify DM Promoters per cell-type
normal_anno = data.frame(row.names = colnames(normal_pm), Cell_Type = rep(c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8"), each = 2))
normal_anno$Cell_Type = factor(
x = normal_anno$Cell_Type,
levels = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
)
design = model.matrix(~0+normal_anno$Cell_Type)
colnames(design) = levels(normal_anno$Cell_Type)
fit = limma::lmFit(object = normal_pm, design = design)
contrast_matrix =
limma::makeContrasts(ISP - CD34, TCRn - CD34, TCRp - CD34, SPo4 - CD34, SPo8 - CD34,
levels = design)
fit2 = limma::contrasts.fit(fit = fit, contrasts = contrast_matrix)
fit2 = limma::eBayes(fit = fit2)
normal_ct_res = lapply(1:5, function(i){
x = limma::topTable(fit2,
coef = i,
adjust = "BH",
number = "all")
data.table::setDT(x = x, keep.rownames = TRUE)
})
names(normal_ct_res) = c("ISP", "TCRn", "TCRp", "SPo4", "SPo8")
thymic_dmp_table = data.frame(
Hypo = sapply(normal_ct_res, function(x)
nrow(x[P.Value < 0.01][logFC < 0])),
Hyper = sapply(normal_ct_res, function(x)
nrow(x[P.Value < 0.01][logFC > 0]))
)
print(thymic_dmp_table)
## Hypo Hyper
## ISP 184 127
## TCRn 531 289
## TCRp 829 286
## SPo4 766 1446
## SPo8 686 1802
thymic_dmps = unique(as.character(unlist(lapply(normal_ct_res, function(x) x[P.Value < 0.01, rn]))))
Barplot of DEG and DMP
thymic_deg_table = thymic_deg_table[c("ISP_CD28p", "DP_CD3n", "DP_CD3p", "SP_CD4p", "SP_CD8p"),]
par(mar = c(2, 2, 3, 2))
b = barplot(t(thymic_deg_table), horiz = TRUE, xlim = c(-2500, 3300), col = c("#807DBA", "#F16913"), names.arg = rep(NA, 5), axes = FALSE)
barplot(-t(thymic_dmp_table), horiz = TRUE, add = TRUE, col = c("#807DBA", "#F16913"), names.arg = rep(NA, 5), axes = FALSE)
legend(x = -2000, y = 8, legend = c("Hypo", "Hyper"), col = c("#807DBA", "#F16913"), pch = 15, xpd = TRUE, bty = "n", title = "Methylation", cex = 0.8, ncol = 2)
legend(x = 2000, y = 8, legend = c("Down", "Up"), col = c("#807DBA", "#F16913"), pch = 15, xpd = TRUE, bty = "n", title = "Expression", cex = 0.8, ncol = 2)
axis(side = 1, at = seq(-3000, 3000, 500), labels = abs(seq(-3000, 3000, 500)), cex.axis = 0.8)
mtext(text = c("ISP", "TCRn", "TCRp", "SP4", "SP8"), side = 2, at = b, las =2, cex = 0.8)
getAvgMeth = function(m, g, scale = TRUE){
mg = sapply(g, function(gi){
sapply(seq(1, 12, 2), function(i){
mean(m[gi,][c(i, i+1)])
})
})
if(scale){
mg = apply(mg, 2, scale)
}
rownames(mg) = c("CD34", "ISP", "TCRn", "TCRp", "SP4", "SP8")
mg
}
plotLineMeth = function(x){
#par(mfrow = c(1, ncol(x)), mar = c(4, 3, 1, 1))
par(mar = c(4, 3, 1, 1))
for(i in 1:ncol(x)){
plot(x[,i], type = 'l', axes = FALSE, ylim = range(pretty(x[,i])), xlab = NA)
axis(side = 1, at = 1:nrow(x), labels = rownames(x), las = 2)
axis(side = 2, at = pretty(x[,i]), labels = pretty(x[,i]), line = 0.5, las = 2)
points(x[,i])
title(colnames(x)[i])
}
}
#Order according to dev. stages
vsd_log_mat = vsd_log_mat[,c("CD34pCD1n_R1", "CD34pCD1n_R2", "ISP_CD28p_R1", "ISP_CD28p_R2", "DP_CD3n_R1", "DP_CD3n_R2", "DP_CD3p_R1", "DP_CD3p_R2", "SP_CD4p_R1", "SP_CD4p_R2", "SP_CD8p_R1", "SP_CD8p_R2")]
com_genes = intersect(rownames(normal_pm), rownames(normal_rna))
length(com_genes)
## [1] 17324
Draw DNAm and expression dynamics for few known marker genes
mark_meth = getAvgMeth(m = as.matrix(normal_pm), g = c("CD34", "SPI1", "NOTCH1", "TCF7", "CD27", "CD8A"), scale = TRUE)
mark_exp = getAvgMeth(m = as.matrix(vsd_log_mat), g = c("CD34", "SPI1", "NOTCH1", "TCF7", "CD27", "CD8A"), scale = TRUE)
par(mfrow = c(2, ncol(mark_meth)))
plotLineMeth(x = mark_exp)
plotLineMeth(x = mark_meth)
Genes with both DM promoter and DM expressions
isp_com_dg = intersect(thymic_results$ISP_CD28p[padj < 0.1, rn], normal_ct_res$ISP[P.Value < 0.01, rn])
tcrn_com_dg = intersect(thymic_results$DP_CD3n[padj < 0.1, rn], normal_ct_res$TCRn[P.Value < 0.01, rn])
tcrp_com_dg = intersect(thymic_results$DP_CD3p[padj < 0.1, rn], normal_ct_res$TCRp[P.Value < 0.01, rn])
sp4_com_dg = intersect(thymic_results$SP_CD4p[padj < 0.1, rn], normal_ct_res$SPo4[P.Value < 0.01, rn])
sp8_com_dg = intersect(thymic_results$SP_CD8p[padj < 0.1, rn], normal_ct_res$SPo8[P.Value < 0.01, rn])
lapply(list(isp_com_dg, tcrn_com_dg, tcrp_com_dg, sp4_com_dg, sp8_com_dg), length)
## [[1]]
## [1] 44
##
## [[2]]
## [1] 145
##
## [[3]]
## [1] 235
##
## [[4]]
## [1] 588
##
## [[5]]
## [1] 626
udgs = sort(unique(c(isp_com_dg, tcrn_com_dg, tcrp_com_dg, sp4_com_dg, sp8_com_dg)))
length(udgs)
## [1] 942
writexl::write_xlsx(x = data.table::data.table(Gene = udgs), path = "Supplementary_table_S4.xlsx", col_names = TRUE, format_headers = TRUE)
udgs_m = t(getAvgMeth(m = as.matrix(normal_pm), g = udgs, scale = FALSE))
udgs_e = t(getAvgMeth(m = as.matrix(vsd_log_mat), g = udgs, scale = FALSE))
par(mar = c(4, 4, 1, 1))
plot(NA, xlim = c(1,6), ylim = c(-1, 0.1), axes = FALSE, xlab = NA, ylab = NA)
axis(side = 1, at = 1:6, labels = colnames(udgs_m), las = 2)
axis(side = 2, at = seq(-1, 0, 0.2), las = 2)
abline(h = seq(-1, 0, 0.2), v = 1:5, lty = 2, col = 'gray80')
for(i in 1:6){
xcor = cor.test(udgs_m[,i ], udgs_e[,i])
points(x = i , y = xcor$estimate, pch = 19)
segments(x0 = i, y0 = xcor$conf.int[1], x1 = i, y1 = xcor$conf.int[2], lwd = 1)
text(x = i, y = 0.1, labels = ifelse(xcor$p.value < 0.001, yes = "***", no = "ns"), xpd = TRUE)
}
mtext(text = "Pearson correlation", side = 2, line = 2.5)
GO for common promoter methylated and Differentially expressed gene
udgs_go = goseq_wrapper(assayedGenes = rownames(vsd_log_mat), deGenes = udgs, source_id = "geneSymbol")
par(mar = c(3, 1, 1, 2))
x = udgs_go[1:20]
x[, log_pvalue := -log10(over_represented_pvalue)]
x_lims = ceiling(max(x[, log_pvalue]))
plot(
NA,
xlim = c(0, x_lims),
ylim = c(0, nrow(x)),
axes = FALSE,
xlab = NA,
ylab = NA
)
#abline(v = seq(0, x_lims, by = 1), col = 'gray90', lty = 2)
for (j in 1:nrow(x)) {
rect(
xleft = x[j, log_pvalue],
ybottom = j - 1,
xright = 0,
ytop = j,
col = grDevices::adjustcolor(col = "black", alpha.f = 0.2),
border = "white"
)
}
axis(side = 1, at = pretty(c(0, x_lims)))
abline(v = 2, lty = 2, col = 'maroon')
#title(main = names(down_go)[i], adj = 0, font.main = 4)
text(
x = rep(0, nrow(x)),
y = seq(0.5, nrow(x) - 0.5, 1),
labels = x[, term],
adj = 0,
xpd = TRUE,
col = 'black',
font = 1, cex = 0.7
)
dnam_ct = c("CD34", "ISP", "TCRn", "TCRp", "SP4", "SP8")
dnam_ct_cols = ct_cols
names(dnam_ct_cols) = dnam_ct
x = pheatmap::pheatmap(mat = normal_pm[udgs, ], scale = "row", show_rownames = FALSE, width = 4, height = 5, cluster_cols = FALSE, annotation = data.frame(row.names = colnames(normal_pm), Cell_type = rep(dnam_ct, each = 2)), annotation_colors = list(Cell_type = dnam_ct_cols), show_colnames = FALSE, annotation_legend = TRUE)
pheatmap::pheatmap(vsd_log_mat[rownames(normal_pm[udgs, ][x$tree_row$order,]), ], cluster_rows = FALSE, scale = 'row', show_rownames = FALSE, width = 3, height = 5, cluster_cols = FALSE, annotation = coldata[,c("Cell_type"), drop = FALSE], annotation_colors = list(Cell_type = ct_cols), show_colnames = FALSE, annotation_legend = FALSE)
com_probes = intersect(rownames(normal_data), rownames(tumor_data))
meth_exprs = Biobase::combine(tumor_data[com_probes,], normal_data[com_probes,])
dim(meth_exprs)
## Features Samples
## 779393 155
#Remove Thymic DMPs from analysis
meth_exprs = meth_exprs[rownames(meth_exprs)[!rownames(meth_exprs) %in% normal_ct_res_probes_uniq], ]
print(dim(meth_exprs))
## Features Samples
## 756324 155
meth_exprs$Cluster = ifelse(is.na(meth_exprs$Cluster), "Normal", as.character(meth_exprs$Cluster))
cell_type <- as.factor(meth_exprs$Cluster)
design <- model.matrix(~0+cell_type)
colnames(design) <- levels(cell_type)
fit <- limma::lmFit(object = exprs(meth_exprs), design = design)
contrast_matrix <- limma::makeContrasts(C1-Normal, C2-Normal, C3-Normal, C4-Normal, C5-Normal, levels = design)
fit2 <- limma::contrasts.fit(fit = fit, contrasts = contrast_matrix)
fit2 <- limma::eBayes(fit = fit2)
dmps = lapply(1:5, function(k) {
k_dmps = limma::topTable(fit2,
coef = k,
adjust = "BH",
number = "all")
data.table::setDT(k_dmps, keep.rownames = TRUE)
k_dmps = merge(k_dmps, epic_anno, all.x = TRUE)
k_dmps
})
names(dmps) = paste0("C", 1:5)
sup_table_s5 = lapply(dmps, function(x){
x[adj.P.Val < 0.05][abs(logFC) > 0.20]
})
writexl::write_xlsx(x = sup_table_s5, path = "Supplementary_table_S5.xlsx", col_names = TRUE, format_headers = TRUE)
#DMP definition: FDR < 0.05 and |meth| > 20%
dmp_tbl = lapply(dmps, function(x) {
x = x[adj.P.Val < 0.05][abs(logFC) > 0.20]
data.frame(table(ifelse(
x$logFC > 0, yes = "Hyper", no = "Hypo"
)))
})
dmp_tbl = data.table::rbindlist(l = dmp_tbl, idcol = "Cluster")
dmp_tbl[,freq := Freq / sum(Freq), .(Cluster)]
#DMPs identified
print(data.table::dcast(data = dmp_tbl, Cluster ~ Var1, value.var = 'freq'))
## Cluster Hyper Hypo
## 1: C1 0.6159886 0.3840114
## 2: C2 0.6351961 0.3648039
## 3: C3 0.7749324 0.2250676
## 4: C4 0.7841419 0.2158581
## 5: C5 0.8739028 0.1260972
print(data.table::dcast(data = dmp_tbl, Cluster ~ Var1, value.var = 'Freq'))
## Cluster Hyper Hypo
## 1: C1 34913 21765
## 2: C2 49161 28234
## 3: C3 77921 22631
## 4: C4 98529 27123
## 5: C5 102452 14783
dmp_tbl$Freq_2 = ifelse(test = dmp_tbl$Var1 == "Hypo", yes = -1*as.numeric(dmp_tbl$Freq), dmp_tbl$Freq)
dmp_tbl$freq_2 = ifelse(test = dmp_tbl$Var1 == "Hypo", yes = -1*as.numeric(dmp_tbl$freq), dmp_tbl$freq)
#Plot raw numbers
temp_x_lim = range(dmp_tbl$Freq_2)
dmp_tbl = split(dmp_tbl, as.factor(dmp_tbl$Cluster))
par(mar = c(2, 2, 1, 1))
plot(NA, NA, xlim = range(pretty(temp_x_lim)), ylim = c(0, 5.5), axes = FALSE, xlab = NA, ylab = NA)
abline(h = seq(0.5, 4.5, 1), v = pretty(temp_x_lim), col = grDevices::adjustcolor(col = 'gray70', alpha.f = 0.4))
for(i in 1:length(dmp_tbl)){
temp_x = dmp_tbl[[i]]
rect(xleft = 0, ybottom = i-1, xright = temp_x[Var1 %in% "Hyper", Freq_2], ytop = i, col = "#F16913", border = "white", lwd = 1.2)
rect(xleft = 0, ybottom = i-1, xright = temp_x[Var1 %in% "Hypo", Freq_2], ytop = i, col = "#807DBA", border = "white", lwd = 1.2)
}
axis(side = 1, at = pretty(temp_x_lim), labels = abs(pretty(temp_x_lim)), cex.axis = 0.5)
axis(side = 2, at = seq(0.5, 4.5, 1), labels = names(dmp_tbl), las = 2, tick = FALSE, line = -1)
text(x = 2500, y = 5.2, labels = "Hyper", adj = 0, col = "#F16913", font = 4)
text(x = -1500, y = 5.2, labels = "Hypo", adj = 1, col = "#807DBA", font = 4)
#Genomic annotation
anno_stats = lapply(dmps, function(x){
x$DM = ifelse(x$logFC > 0 , yes = 'Hyper', no = 'Hypo')
x = x[,.N,.(DM, Annotation)]
x[,fract := N/sum(N), .(DM)]
x
})
anno_stats = data.table::rbindlist(l = anno_stats, idcol = 'Cluster')
anno_stats = data.table::dcast(data = anno_stats, Cluster ~ Annotation+DM, value.var = 'fract')
data.table::setDF(x = anno_stats, rownames = anno_stats$Cluster)
anno_stats = anno_stats[,-1]
anno_stats_hypo = t(anno_stats[,-grep(pattern = "Hyper", x = colnames(anno_stats))])
anno_stats_hyper = t(anno_stats[,grep(pattern = "Hyper", x = colnames(anno_stats))])
#CpG island anno
island_stats = lapply(dmps, function(x){
x$DM = ifelse(x$logFC > 0 , yes = 'Hyper', no = 'Hypo')
x = x[,.N,.(DM, Relation_to_Island)]
x[,fract := N/sum(N), .(DM)]
x
})
island_stats = data.table::rbindlist(l = island_stats, idcol = 'Cluster')
island_stats = data.table::dcast(data = island_stats, Cluster ~ Relation_to_Island+DM, value.var = 'fract')
data.table::setDF(x = island_stats, rownames = island_stats$Cluster)
island_stats = island_stats[,-1]
island_stats_hypo = t(island_stats[,-grep(pattern = "Hyper", x = colnames(island_stats))])
island_stats_hyper = t(island_stats[,grep(pattern = "Hyper", x = colnames(island_stats))])
layout(mat = matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE), widths = c(1, 1.2), heights = c(0.95, 1))
par(mar = c(1, 3, 1, 1), las = 2)
barplot(height = anno_stats_hyper, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1), axes = FALSE)
barplot(height = anno_stats_hypo, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1.5), axes = FALSE)
legend(x = "topright", col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), legend = gsub(pattern = "_Hypo", replacement = "", x = rownames(anno_stats_hypo)), pch = 15, cex = 0.5)
par(mar = c(3, 3, 0, 1))
barplot(height = island_stats_hyper, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1), axes = FALSE)
axis(side = 1, at = seq(0, 1, 0.2))
barplot(height = island_stats_hypo, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1.5), axes = FALSE)
legend(x = "topright", col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), legend = gsub(pattern = "_Hyper", replacement = "", x = rownames(island_stats_hyper)), pch = 15, cex = 0.6)
axis(side = 1, at = seq(0, 1, 0.2), cex = 0.6)
T-ALL samples 12 T-ALL samples are analyzed, generated as a part of BLUEPRINT cohort. Steps involved in processing: (MACS2 peakcalling -> peak filtering -> SE calling) with chiptk. Super and Typical enhancers are identified using ROSE.
Normal Thymic cell types 5 distinct thymic cell types from healthy thymus were analyzed, generated as a part of BLUEPRINT cohort.
Following analysis are performed:
H3K27ac, H3K4me1, and H3K4me3H3K27ac| Regulatory region | H3K4me1 | H3K27ac | H3K4me3 |
|---|---|---|---|
| Active enhancers | Yes | Yes | NA |
| Poised enhancers | Yes | No | NA |
| Active promoters | NA | NA | Yes |
| Super enhancers | NA | Yes (12kb stitched) | NA |
#SuperEnhancers
se_files = list.files(path = "data/BLUEPRINT/H3K27ac/SE/", pattern = '_AllEnhancers_ENHANCER_TO_GENE.txt', all.files = TRUE, recursive = TRUE, full.names = TRUE)
#K27ac narropeaks
k27ac_files = list.files(path = "data/BLUEPRINT/H3K27ac/narrowPeak/", pattern = '*\\.narrowPeak$', all.files = TRUE, recursive = TRUE, full.names = TRUE)
#bigWig files
bw_files = list.files(path = "data/BLUEPRINT/H3K27ac/bigWigs/", pattern = '_treat_pileup.bw', all.files = TRUE, recursive = TRUE, full.names = TRUE)
k27ac_pdata = data.frame(se_files, bw_files, narrowpeaks = k27ac_files, stringsAsFactors = FALSE)
k27ac_pdata$Sample_Name = paste0(
"sample_",
gsub(
pattern = "_H3K27ac_treat_pileup.bw|b|_treat_pileup.bw",
replacement = "",
x = basename(path = k27ac_pdata$bw_files)
)
)
#T13C is sample649
k27ac_pdata$Sample_Name = gsub(pattern = "T13C", replacement = "649", x = k27ac_pdata$Sample_Name)
k27ac_pdata = merge(k27ac_pdata, pData(tumor_data), all.x = TRUE)
table(k27ac_pdata$Cluster)
##
## C1 C2 C3 C4 C5
## 0 7 0 2 3
#Process bigWigs with bwtool to get the peak intensities (area under the peak)
pdata_act_enh = data.table::copy(x = k27ac_pdata)
pdata_act_enh = pdata_act_enh[pdata_act_enh$Tissue %in% "Leukemic_BM",]
pdata_act_enh = peakSeason::read_coldata(coldata = pdata_act_enh, files_idx = 3, sample_idx = 1)
## Checking for files..
## Done!
#Extract peak intensities for active K27ac peaks (peaks non-overlapping with known TSS)
pdata_act_enh_auc = peakSeason::extract_summary(
coldata = pdata_act_enh,
bed = "data/BLUEPRINT/H3K27ac/narrowPeak_minusTSS/consensus_active_disease_enhancers.bed",
op_dir = tempdir(check = TRUE),
nthreads = 12,
rmAfter = TRUE
)
## Done!
summary_mat = pdata_act_enh_auc$summaries[,6:ncol(pdata_act_enh_auc$summaries)]
summary_mat = data.table::setDF(x = summary_mat)
summary_mat = summary_mat[complete.cases(summary_mat),]
summary_mat = order_by_sds(mat = summary_mat)
#PCA on top 25000 peaks
summary_mat_pca = prcomp(t(summary_mat[1:25000,]))
summary_mat_pca_dat = as.data.frame(summary_mat_pca$x[,c("PC1", "PC2", "PC3", "PC4"), drop= FALSE])
var_explained_k27ac = summary_mat_pca$sdev^2/sum(summary_mat_pca$sdev^2)
summary_mat_pca_dat = cbind(summary_mat_pca_dat, Cluster = pdata_act_enh_auc$cdata$Cluster)
#K4me3 narropeaks
k4me3_files = list.files(path = "data/BLUEPRINT/H3K4me3/narrowPeak/", pattern = '*\\.narrowPeak$', all.files = TRUE, recursive = TRUE, full.names = TRUE)
#bigWig files
bw_files = list.files(path = "data/BLUEPRINT/H3K4me3/bigWigs/", pattern = '_treat_pileup.bw', all.files = TRUE, recursive = TRUE, full.names = TRUE)
k4me3_pdata = data.frame(bw_files, narrowpeaks = k4me3_files, stringsAsFactors = FALSE)
k4me3_pdata$Sample_Name = paste0(
"sample_",
gsub(
pattern = "_H3K4me3_treat_pileup.bw|b|_treat_pileup.bw",
replacement = "",
x = basename(path = k4me3_pdata$bw_files)
)
)
#T13C is sample649
k4me3_pdata$Sample_Name = gsub(pattern = "T13C", replacement = "649", x = k4me3_pdata$Sample_Name)
k4me3_pdata = merge(k4me3_pdata, pData(tumor_data), all.x = TRUE)
table(k4me3_pdata$Cluster)
##
## C1 C2 C3 C4 C5
## 0 7 0 1 3
#Process bigWigs with bwtool to get the peak intensities (area under the peak)
pdata_promoters = data.table::copy(x = k4me3_pdata)
pdata_promoters = pdata_promoters[pdata_promoters$Tissue %in% "Leukemic_BM",]
pdata_promoters = peakSeason::read_coldata(coldata = pdata_promoters, files_idx = 2, sample_idx = 1)
## Checking for files..
## Done!
#Extract peak intensities for k4me3 peaks
pdata_promoters_auc = peakSeason::extract_summary(
coldata = pdata_promoters,
bed = "data/BLUEPRINT/H3k4me3/narrowPeak/consensus_active_disease_promoters.bed",
op_dir = tempdir(check = TRUE),
nthreads = 12,
rmAfter = TRUE
)
## Done!
summary_mat_prom = pdata_promoters_auc$summaries[,6:ncol(pdata_promoters_auc$summaries)]
summary_mat_prom = data.table::setDF(x = summary_mat_prom)
summary_mat_prom = summary_mat_prom[complete.cases(summary_mat_prom),]
summary_mat_prom = order_by_sds(mat = summary_mat_prom)
#PCA on top 25000 peaks
summary_mat_prom_pca = prcomp(t(summary_mat_prom[1:25000,]))
summary_mat_prom_pca_dat = as.data.frame(summary_mat_prom_pca$x[,c("PC1", "PC2", "PC3", "PC4"), drop= FALSE])
var_explained_k4me3 = summary_mat_prom_pca$sdev^2/sum(summary_mat_prom_pca$sdev^2)
summary_mat_prom_pca_dat = cbind(summary_mat_prom_pca_dat, Cluster = pdata_promoters_auc$cdata$Cluster)
par(mfrow = c(1, 2), mar = c(3, 3, 1, 1))
plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(pretty(summary_mat_pca_dat$PC1)), ylim = range(pretty(summary_mat_pca_dat$PC2)))
abline(h = pretty(summary_mat_pca_dat$PC2), v = pretty(summary_mat_pca_dat$PC1), col = grid_cols, lty = 2)
points(x = summary_mat_pca_dat$PC1, y = summary_mat_pca_dat$PC2, bg = clust_cols[summary_mat_pca_dat$Cluster], pch = 21, col = "black")
axis(side = 1, at = pretty(summary_mat_pca_dat$PC1), lwd = 1.5, font = 1, cex.axis = 0.8)
axis(side = 2, at = pretty(summary_mat_pca_dat$PC2), lwd = 1.5, font = 1, las = 2, cex.axis = 0.8)
mtext(text = paste0("PC2 [", round(var_explained_k27ac[2], digits = 2), "]"), side = 2, line = 2, cex = 0.8)
mtext(text = paste0("PC1 [", round(var_explained_k27ac[1], digits = 2), "]"), side = 1, line = 2, cex = 0.8)
title(main = "H3K27ac", adj = 0, cex.main = 0.8)
legend(x = "bottomright", legend = names(clust_cols[c("C1", "C2", "C4", "C5")]), col = clust_cols[c("C1", "C2", "C4", "C5")], pt.bg = clust_cols[1:6], pch = 19, pt.cex = 0.6, cex = 0.6)
plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(pretty(summary_mat_prom_pca_dat$PC1)), ylim = range(pretty(summary_mat_prom_pca_dat$PC2)))
abline(h = pretty(summary_mat_prom_pca_dat$PC2), v = pretty(summary_mat_prom_pca_dat$PC1), col = grid_cols, lty = 2)
points(x = summary_mat_prom_pca_dat$PC1, y = summary_mat_prom_pca_dat$PC2, bg = clust_cols[summary_mat_prom_pca_dat$Cluster], pch = 21, col = "black")
axis(side = 1, at = pretty(summary_mat_prom_pca_dat$PC1), lwd = 1.5, font = 1, cex.axis = 0.8)
axis(side = 2, at = pretty(summary_mat_prom_pca_dat$PC2), lwd = 1.5, font = 1, las = 2, cex.axis = 0.8)
mtext(text = paste0("PC2 [", round(var_explained_k4me3[2], digits = 2), "]"), side = 2, line = 2, cex = 0.8)
mtext(text = paste0("PC1 [", round(var_explained_k4me3[1], digits = 2), "]"), side = 1, line = 2, cex = 0.8)
title(main = "H3K4me3", adj = 0, cex.main = 0.8)
Super enhancers per cluster were identified by merging all BAM files from same cluster follwed by analysis with ROSE.
se_cluster_files = list.files(path = "data/BLUEPRINT/H3K27ac/SE_clusters/", full.names = TRUE, pattern = "_peaks_merged_H3K27ac_AllEnhancers_ENHANCER_TO_GENE.txt")
se_normal_files = k27ac_pdata[is.na(k27ac_pdata$Tissue), "se_files"]
layout(matrix(data = 1:5, nrow = 1, ncol = 5))
temp_var = lapply(X = seq_len(length(se_normal_files)), function(i){
bn = gsub(pattern = "_blacklist_filtered_AllEnhancers_ENHANCER_TO_GENE.txt", replacement = "", x = basename(se_normal_files[i]))
plot_SE(enhToGene = se_normal_files[i], sample = bn)
})
rm(temp_var)
layout(matrix(data = 1:3, nrow = 1, ncol = 3))
temp_var = lapply(X = seq_len(length(se_cluster_files)), function(i){
bn = gsub(pattern = "_peaks_merged_H3K27ac_AllEnhancers_ENHANCER_TO_GENE.txt", replacement = "", x = basename(se_cluster_files[i]))
plot_SE(enhToGene = se_cluster_files[i], sample = bn)
})
rm(temp_var)
Enhancer distribution per sample
se_ntbl = lapply(k27ac_pdata$se_files, function(x ){data.table::fread(x)[, .N, V14]})
names(se_ntbl) = k27ac_pdata$Sample_Name
se_ntbl = data.table::rbindlist(l = se_ntbl, idcol = "Sample_Name")
se_ntbl = merge(se_ntbl, k27ac_pdata[,c("Sample_Name", "Cluster")], by = "Sample_Name")
se_ntbl$Cell_Type = ifelse(is.na(as.character(se_ntbl$Cluster)), yes = unlist(data.table::tstrsplit(x = se_ntbl$Sample_Name, "_", keep = 2)), no = as.character(se_ntbl$Cluster))
se_ntbl$Condition = ifelse(is.na(as.character(se_ntbl$Cluster)), yes = "Normal", no = "Tumor")
colnames(se_ntbl)[2] = "isSuper"
se_ntbl$isSuper = ifelse(test = se_ntbl$isSuper == 1, yes = "SE", no = "TE")
#Sumary table for SE/TE numbers
se_ntbl_sup = data.table::dcast(data = se_ntbl, Sample_Name ~ isSuper, value.var = "N")
se_ntbl_sup = merge(se_ntbl_sup, se_ntbl[,.(Sample_Name, Cell_Type)])[!duplicated(Sample_Name)][order(Cell_Type)]
cell_type_cols = c(clust_cols[c("C2", "C4", "C5")], "CD34" = "#1B9E77", EC = "#D95F02", LC = "#7570B3", SP4 = "#66A61E", SP8 = "#E6AB02")
layout(mat = matrix(c(1, 2), ncol = 2, nrow = TRUE), widths = c(3, 1))
par(mar = c(3, 3, 1, 1))
b = boxplot(N ~ Cell_Type, data = se_ntbl[isSuper == "SE"], xaxt = "n", boxwex = 0.5, outline = TRUE, lty = 1.5, lwd = 1.8, outwex = 0, staplewex = 0, ylim = c(0, 1200), axes = FALSE, border = cell_type_cols)
abline(v = 1:9, h = pretty(c(0, 1200)), lty = 2, col = grid_cols)
axis(side = 1, at = 1:8, labels = names(cell_type_cols), lwd = 1.5, font = 1, las = 2, cex.axis = 0.6)
axis(side = 2, at = pretty(c(0, 1200)), las = 2, lwd = 1.5, font = 1, las = 2, cex.axis = 0.6)
par(mar = c(3, 0, 1, 1))
b = boxplot(N ~ Condition, data = se_ntbl[isSuper == "SE"], xaxt = "n", boxwex = 0.5, outline = TRUE, lty = 1.5, lwd = 1.8, outwex = 0, staplewex = 0, ylim = c(0, 1200), axes = FALSE, border = c("royalblue", "maroon"))
axis(side = 1, at = 1:2, labels = c("Thymic", "T-ALL"), lwd = 1.5, font = 1, las = 2, cex.axis = 0.6)
abline(v = 1:2, h = pretty(c(0, 1200)), lty = 2, col = grid_cols)
#axis(side = 2, at = pretty(c(0, 1200)), las = 2, lwd = 1.5, font = 1, las = 2, cex.axis = 1.8)
t.test(x = se_ntbl[isSuper == "SE"][Condition == 'Tumor', N], y = se_ntbl[isSuper == "SE"][Condition == 'Normal', N])
##
## Welch Two Sample t-test
##
## data: se_ntbl[isSuper == "SE"][Condition == "Tumor", N] and se_ntbl[isSuper == "SE"][Condition == "Normal", N]
## t = 3.4516, df = 14.951, p-value = 0.003575
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 114.1345 482.9655
## sample estimates:
## mean of x mean of y
## 658.75 360.20
dbPath = file.path("data/LOLA/")
regionDB = LOLA::loadRegionDB(dbLocation = dbPath)
## Reading collection annotations:
## In collection 'disease_enhancers', consider adding a 'collection.txt' annotation file.
## In collection 'disease_promoters', consider adding a 'collection.txt' annotation file.
## In collection 'Genomic', consider adding a 'collection.txt' annotation file.
## In collection 'normal_enhancers', consider adding a 'collection.txt' annotation file.
## In collection 'regulatory', consider adding a 'collection.txt' annotation file.
## In collection 'repressors', consider adding a 'collection.txt' annotation file.
## Reading region annotations...
## ::Loading cache:: data/LOLA/disease_enhancers//disease_enhancers_files.RData
## ::Loading cache:: data/LOLA/disease_promoters//disease_promoters_files.RData
## ::Loading cache:: data/LOLA/Genomic//Genomic_files.RData
## ::Loading cache:: data/LOLA/normal_enhancers//normal_enhancers_files.RData
## ::Loading cache:: data/LOLA/regulatory//regulatory_files.RData
## ::Loading cache:: data/LOLA/repressors//repressors_files.RData
## disease_enhancers
## ::Loading cache:: data/LOLA/disease_enhancers/disease_enhancers.RData
## disease_promoters
## ::Loading cache:: data/LOLA/disease_promoters/disease_promoters.RData
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
## - in 'x': chr17_gl000205_random, chrUn_gl000219, chrUn_gl000220, chr7_gl000195_random, chrUn_gl000218, chrUn_gl000225
## - in 'y': chrY, chr17_gl000204_random
## Make sure to always combine/compare objects based on the same reference
## genome (use suppressWarnings() to suppress this warning).
## Genomic
## ::Loading cache:: data/LOLA/Genomic/Genomic.RData
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
## - in 'x': chrUn_gl000225, chr17_gl000204_random
## - in 'y': chrM, chr17_ctg5_hap1, chr19_gl000209_random, chr1_gl000191_random, chr4_ctg9_hap1, chr4_gl000193_random, chr4_gl000194_random, chr6_apd_hap1, chr6_cox_hap2, chr6_dbb_hap3, chr6_mann_hap4, chr6_mcf_hap5, chr6_qbl_hap6, chr6_ssto_hap7, chrUn_gl000211, chrUn_gl000212, chrUn_gl000213, chrUn_gl000215, chrUn_gl000222, chrUn_gl000223, chrUn_gl000224, chrUn_gl000227, chrUn_gl000228, chrUn_gl000241
## Make sure to always combine/compare objects based on the same reference
## genome (use suppressWarnings() to suppress this warning).
## normal_enhancers
## ::Loading cache:: data/LOLA/normal_enhancers/normal_enhancers.RData
## regulatory
## ::Loading cache:: data/LOLA/regulatory/regulatory.RData
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
## - in 'x': chr17_gl000205_random, chrUn_gl000219, chrUn_gl000220, chr7_gl000195_random, chrUn_gl000218, chrUn_gl000225, chr17_ctg5_hap1, chr19_gl000209_random, chr1_gl000191_random, chr4_ctg9_hap1, chr4_gl000194_random, chr6_apd_hap1, chr6_cox_hap2, chr6_dbb_hap3, chr6_mann_hap4, chr6_mcf_hap5, chr6_qbl_hap6, chr6_ssto_hap7, chrUn_gl000211, chrUn_gl000213, chrUn_gl000215, chrUn_gl000222, chrUn_gl000223, chrUn_gl000224, chrUn_gl000227, chrUn_gl000228, chrUn_gl000241
## - in 'y': chr11_gl000202_random
## Make sure to always combine/compare objects based on the same reference
## genome (use suppressWarnings() to suppress this warning).
## repressors
## ::Loading cache:: data/LOLA/repressors/repressors.RData
#All hyper DMPs as a background
bg_loci_all_dmps = data.table::rbindlist(l = dmps)[adj.P.Val < 0.05 & abs(logFC) > 0.2][!duplicated(rn)][,.(Chr, Start, end = End+1)]
bg_loci = GenomicRanges::makeGRangesFromDataFrame(df = bg_loci_all_dmps)
rm(bg_loci_all_dmps)
dmp_dat_gr = lapply(dmps, function(x) GenomicRanges::makeGRangesFromDataFrame(df = x[adj.P.Val < 0.05 & logFC > 0.2][,.(Chr, Start, end = End+1)]))
lola_res_hyper = LOLA::runLOLA(userSets = dmp_dat_gr, userUniverse = bg_loci, regionDB = regionDB, cores = 2)
## Calculating unit set overlaps...
## Calculating universe set overlaps...
## Calculating Fisher scores...
lola_res_hyper$pValueLog = ifelse(test = is.infinite(x = lola_res_hyper$pValueLog), yes = -log10(.Machine$double.xmin), no = lola_res_hyper$pValueLog)
lola_res_hyper$qValue = p.adjust(10^-lola_res_hyper$pValueLog, "fdr")
lola_res_hyper$significant = ifelse(test = lola_res_hyper$qValue < 0.01, yes = "Yes", no = "No")
#Plot LOLA results
lola_res_hyper$filename = factor(x = lola_res_hyper$filename, levels = unique(lola_res_hyper[order(collection, filename)][,filename]))
dmp_dat_gr = lapply(dmps, function(x) GenomicRanges::makeGRangesFromDataFrame(df = x[adj.P.Val < 0.05 & logFC < -0.2][,.(Chr, Start, end = End+1)]))
lola_res_hypo = LOLA::runLOLA(userSets = dmp_dat_gr, userUniverse = bg_loci, regionDB = regionDB, cores = 2)
## Calculating unit set overlaps...
## Calculating universe set overlaps...
## Calculating Fisher scores...
lola_res_hypo$pValueLog = ifelse(test = is.infinite(x = lola_res_hypo$pValueLog), yes = -log10(.Machine$double.xmin), no = lola_res_hypo$pValueLog)
lola_res_hypo$qValue = p.adjust(10^-lola_res_hypo$pValueLog, "fdr")
lola_res_hypo$significant = ifelse(test = lola_res_hypo$qValue < 0.01, yes = "Yes", no = "No")
lola_res_hypo$filename = factor(x = lola_res_hypo$filename, levels = unique(lola_res_hypo[order(collection, filename)][,filename]))
lola_res_tbl = data.table::rbindlist(list(Hyper = lola_res_hyper, Hypo = lola_res_hypo), idcol = "Status")
lola_res_tbl$wrap = ifelse(test = lola_res_tbl$filename %like% "SE", yes = "SE", no = ifelse(lola_res_tbl$filename %like% "TE", yes = "TE", no = lola_res_tbl$description))
lola_res_tbl[, filename := gsub(pattern = ".hg19|.bed", replacement = "", x = lola_res_tbl$filename)]
lola_res_tbl$filename = factor(x = lola_res_tbl$filename, levels = rev(c("Thymic_active_enhancers", "Thymic_primed_enhancers", "Thymic_putative_enhancers", "Thymic_active_promoters", "Thymus_K27Me3", "CD34_SE", "EC_SE", "LC_SE", "SP4_SE", "SP8_SE", "CD34_TE", "EC_TE", "LC_TE", "SP4_TE", "SP8_TE", "Promoters", "Intergenic", "TALL_K27Me3", "C2_consensus_K4me3", "C4_consensus_K4me3", "C5_consensus_K4me3", "C2_SE", "C4_SE", "C5_SE", "C2_TE", "C4_TE", "C5_TE")))
NOTE Below plot might look slightly different based on the LOLA version.
library(ggplot2)
ggplot(data = lola_res_tbl, aes(y = filename, x = userSet, fill = oddsRatio, color = significant))+coord_fixed()+geom_point(aes(size = pValueLog), pch =21)+theme_minimal(base_size = 10)+scale_fill_gradient(low = "gray", high = "black")+scale_color_manual(values = c(No = "darkgreen", Yes = "red"))+xlab("Cluster")+facet_wrap(~Status)+ylab(label = element_blank())
Motif analysis was done using homer findMotifsGenome.pl for hypo DMPs (+/- 100bp) that are located within SE/TE regions from T-ALL. Results from homer - knownResults.txt are stored in data/homer_motifs directory.
#P-value threshold for homer motif results
pthr = 1e-5
kr_files = list.files(path = "data/homer_motifs/", pattern = "knownResults.txt", recursive = TRUE, full.names = TRUE)
homer_res_dat = lapply(kr_files, data.table::fread)
names(homer_res_dat) = paste0("C", 1:5)
homer_motifs = lapply(homer_res_dat, function(x) {
m = x[`P-value` <= pthr]
if (length(m) == 0) {
return(NULL)
}
colnames(m) = c("Motif_Name", "Consensus", "P_value", "Log_P_value", "Q_value", "n_targets", "fract_targets", "n_bg_targets", "fract_bg_targets")
m
})
homer_motifs = data.table::rbindlist(l = homer_motifs, idcol = "Cluster", fill = TRUE, use.names = TRUE)
homer_motifs$Motif = unlist( data.table::tstrsplit(x = homer_motifs$Motif_Name, split = "/", keep = 1))
homer_motifs$Motif_name = data.table::tstrsplit(x = homer_motifs$Motif, split = "\\(", keep = 1)
homer_motifs$Motif_family = data.table::tstrsplit(x = homer_motifs$Motif, split = "\\(", keep = 2)
homer_motifs$Motif_family = gsub(x = homer_motifs$Motif_family, pattern = "\\)$", replacement = "")
homer_motifs_ptbl = data.table::dcast(data = homer_motifs, Motif_Name ~ Cluster, value.var = "P_value")
writexl::write_xlsx(x = homer_motifs_ptbl, path = "Supplementary_table_S7.xlsx", col_names = TRUE)
#After discussing with Aurore following motifs are decided to be shown. Rest goto to supplemental table.
homer_motifs$Motif_name = toupper(x = homer_motifs$Motif_name) #Convert to upper cases (these are consevred across all vertebrates - double checked manually)
homer_motifs_tar = homer_motifs[Motif_name %in% c("CEBP", "CHOP", "HLF", "NFIL3", "ATF4", "AP-1", "PU.1", "ETS1", "FLI1", "NF1", "RUNX1", "MYB", "TAL1", "LEF1", "E2A", "HEB", "TCF4", "BORIS", "GATA1", "GATA2", "GATA3", "GATA4", "GATA6", "HOXD11", "HOXD13", "HOXB13", "HOXA13", "HOXA11")]
homer_motifs_tar[, id := paste0(Cluster, "_", Motif_name)]
homer_motifs_tar = homer_motifs_tar[!duplicated(id)]
homer_motifs_tar = data.table::dcast(data = homer_motifs_tar, Motif_name ~ Cluster, value.var = 'Log_P_value')
homer_motifs_tar_plot_dat = merge(homer_motifs_tar, homer_motifs[,.(Motif_name, Motif_family)], by = "Motif_name")
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[!duplicated(Motif_name)]
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[order(Motif_family, decreasing = TRUE)]
data.table::setDF(x = homer_motifs_tar_plot_dat, rownames = homer_motifs_tar_plot_dat$Motif_name)
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[,-1]
#Re-order
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[c("CEBP", "CHOP", "HLF", "NFIL3", "ATF4", "AP-1", "PU.1", "ETS1", "FLI1", "NF1", "RUNX1", "MYB", "TAL1", "LEF1", "E2A", "HEB", "TCF4", "BORIS", "GATA1", "GATA2", "GATA3", "GATA4", "GATA6", "HOXD11", "HOXD13", "HOXB13", "HOXA13", "HOXA11"),]
fam_order = homer_motifs_tar_plot_dat$Motif_family
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[,-6]
homer_motifs_tar_plot_dat[is.na(homer_motifs_tar_plot_dat)] = 0
homer_motifs_tar_plot_dat[homer_motifs_tar_plot_dat < 0] = 1
homer_motifs_tar_plot_dat = as.matrix(homer_motifs_tar_plot_dat)
lo = matrix(data = 1:3, nrow = 1, ncol = 3)
layout(mat = lo, widths = c(1, 1, 2))
par(mar = c(0, 0, 0, 0))
plot(NA, xlim = c(0, 1), ylim = c(1, nrow(homer_motifs_tar_plot_dat)), axes = FALSE, xlab = NA, ylab = NA)
text(x = 0.5, y = 1:nrow(homer_motifs_tar_plot_dat), labels = rownames(homer_motifs_tar_plot_dat), font = 3)
par(mar = c(0, 0, 0, 0))
plot(NA, xlim = c(0, 1), ylim = c(1, nrow(homer_motifs_tar_plot_dat)), axes = FALSE, xlab = NA, ylab = NA)
text(x = 0.5, y = 1:nrow(homer_motifs_tar_plot_dat), labels = fam_order, cex = 0.8)
par(mar = c(0, 0, 0, 0))
plot(NA, xlim = c(1, ncol(homer_motifs_tar_plot_dat)), ylim = c(1, nrow(homer_motifs_tar_plot_dat)), axes = FALSE, xlab = NA, ylab = NA)
for(i in seq_len(ncol(homer_motifs_tar_plot_dat))){
x = homer_motifs_tar_plot_dat[,i]
for(xi in seq_along(x)){
if(x[xi] == 0){
points(x = i, y = xi, pch = 21, col = "black")
}else{
points(x = i, y = xi, pch = 21, bg = "#02075d", col = 'black')
}
}
}
counts = read.delim(file = "data/BLUEPRINT_expression/count_matrix.tsv.gz", row.names = 1)
anno = read.delim("data/BLUEPRINT_expression/pheno_data.tsv.gz", row.names = 1)
counts = counts[,rownames(anno)]
if(!all(rownames(anno) == colnames(counts))){
stop("Sample order do not match!")
}
#All measured genes
dim(counts)
## [1] 50572 48
#Restricting all ananlysis to protein coding genes
ens_genes = data.table::fread(input = "data/extdata/ens_gene_biotypes.tsv", header = FALSE)
ens_genes = ens_genes[V4 %in% "protein_coding"][V1 %in% 1:22]
counts = counts[rownames(counts) %in% ens_genes$V3,] #Protein coding genes
#All protein coding genes
dim(counts)
## [1] 16624 48
dds = DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = anno, design = ~Platform+Cluster)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq2::DESeq(object = dds, parallel = TRUE)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## -- replacing outliers and refitting for 351 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
vsd = DESeq2::vst(object = dds, blind = FALSE)
vsd_log_mat = order_by_sds(mat = as.data.frame(assay(vsd)))
batch correction on VSD (for visualization)
mod = model.matrix(~as.factor(Cluster), data=colData(vsd))
vsd_batch_corrected = sva::ComBat(dat = assay(vsd), batch = vsd$Platform, par.prior = TRUE, mod = mod)
## Found 2360 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found3batches
## Adjusting for5covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
#vsd_limma = limma::removeBatchEffect(x = assay(vsd), batch = vsd$Platform) #Using limma
vsd_batch_corrected = order_by_sds(as.data.frame(vsd_batch_corrected))
vsd_pca = prcomp(x = t(vsd_batch_corrected[1:1000,]))
vsd_pca_dat = as.data.frame(vsd_pca$x[,c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), drop= FALSE])
vsd_pca_var_explained = vsd_pca$sdev^2/sum(vsd_pca$sdev^2)
vsd_pca_dat = cbind(vsd_pca_dat, colData(vsd))
vsd_pca_dat$clust_color = clust_cols[vsd_pca_dat$Cluster]
#layout(mat = matrix(c(1, 2), nrow = 2, ncol = 1), heights = c(4, 2))
par(mar = c(3, 3, 1, 1))
plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = c(-50, 50), ylim = range(pretty(vsd_pca_dat$PC2)))
abline(h = pretty(vsd_pca_dat$PC2), v = pretty(vsd_pca_dat$PC1), col = "gray90", lty = 2)
vsd_pca_dat$clust_color = ifelse(test = is.na(vsd_pca_dat$clust_color), yes = "gray70", no = vsd_pca_dat$clust_color)
points(x = vsd_pca_dat$PC1, y = vsd_pca_dat$PC2, col = "black", bg = vsd_pca_dat$clust_color, pch = 21, cex = 1)
axis(side = 1, at = seq(-50, 50, 25), cex.axis = 0.8)
axis(side = 2, at = pretty(vsd_pca_dat$PC2), las = 2, cex.axis = 0.8)
mtext(text = paste("PC2 ", round(vsd_pca_var_explained[2], digits = 2)), side = 2, line = 2, cex = 0.7)
mtext(text = paste("PC1 ", round(vsd_pca_var_explained[1], digits = 2)), side = 1, line = 2, cex = 0.7)
#plot.new()
#par(mar = c(0, 0, 0, 0))
legend(x = "topleft", legend = names(clust_cols), col = clust_cols, pch = 19, xpd = TRUE, ncol = 1, cex = 0.6)
coldata_tumor = anno[!anno$Cluster %in% "Normal",]
dim(coldata_tumor)
## [1] 44 58
hmanno = coldata_tumor[,c("TLX1", "TLX3", "TAL1", "HOXA_cis", "HOXA_trans", "HOXA_nd", "ETP_Phenotype", "DNMT3A", "IDH2", "Cohort", "Cluster")]
hmanno$TAL1 = as.character(hmanno$TAL1)
hmanno$TLX3 = as.character(hmanno$TLX3)
hmanno$TLX1 = as.character(hmanno$TLX1)
#hmanno$CALM_AF10 = as.character(hmanno$CALM_AF10)
hmanno$HOXA_cis = as.character(hmanno$HOXA_cis)
hmanno$HOXA_trans = as.character(hmanno$HOXA_trans)
hmanno$HOXA_nd = as.character(hmanno$HOXA_nd)
hmanno$ETP_Phenotype = ifelse(test = as.character(hmanno$ETP_Phenotype) == "Yes", yes = "1", no = "0")
hmanno$DNMT3A = ifelse(test = as.character(hmanno$DNMT3A) == "Mutant", yes = "1", no = "0")
hmanno$IDH2 = ifelse(test = as.character(hmanno$IDH2) == "Mutant", yes = "1", no = "0")
yesno_cols = c("1" = "maroon", "0" = "white")
#tall_genes = c("TLX1", "TLX3", "HOXA9", "LYL1", "LMO1", "LMO2", "TAL1", "TAL2", "NKX2-1", "HOXA11")
tall_genes = c("TLX1", "TLX3", "HOXA9", "TAL1")
tumsamps = rownames(coldata_tumor)
hmanno$Cohort = NULL
pheatmap::pheatmap(vsd_log_mat[tall_genes, tumsamps], annotation_col = hmanno, scale = 'row', show_colnames = FALSE, annotation_colors = list(pred = clust_cols, ETP_Phenotype = yesno_cols, TAL1 = yesno_cols, TLX3 = yesno_cols, TLX1 = yesno_cols, CALM_AF10 = yesno_cols, HOXA_cis = yesno_cols, HOXA_trans = yesno_cols, HOXA_nd = yesno_cols, DNMT3A = yesno_cols, IDH2 = yesno_cols, Cluster = clust_cols))
#
DEGs: Clusters v/s Thymus
clusters = paste0("C", 1:5)
res = lapply(clusters, function(r){
rds = results(object = dds, contrast = c("Cluster", r, "Normal"), lfcThreshold = 1)
#rds <- lfcShrink(dds, contrast = c("Cluster", r, "Th"), res = rds, type = "normal")
rds = as.data.table((as.data.frame(rds)), keep.rownames = TRUE)
rds[order(padj)]})
names(res) = clusters
sup8_data = lapply(res, function(x){
x[padj < 0.1]
})
writexl::write_xlsx(x = sup8_data, path = "Supplementary_table_S8.xlsx", col_names = TRUE, format_headers = TRUE)
significantGenesUpTbl = sapply(res, function(x){
table(cut(x[padj < 0.1][log2FoldChange > 0][,log2FoldChange], breaks = c(seq(0, 15, 3), 50)))
})
significantGenesDownTbl = sapply(res, function(x){
table(cut(x[padj < 0.1][log2FoldChange < 0][,log2FoldChange], breaks = c(-50, seq(0, -15, -3))))
})
par(mar = c(3, 3, 1, 1))
b1 = barplot(height = significantGenesUpTbl, ylim = c(-1000, 1000), col = brewer.pal(8,"Oranges"), names.arg = rep(NA, ncol(significantGenesUpTbl)), axes = FALSE, border = "#34495e")
b2 = barplot(height = -significantGenesDownTbl[6:1,], add = TRUE, col = brewer.pal(8,"Blues"), names.arg = rep(NA, ncol(significantGenesUpTbl)), axes = FALSE, border = "#34495e")
text(x = b1, y = colSums(x = significantGenesUpTbl)+50, labels = colSums(x = significantGenesUpTbl), cex = 0.7, xpd = TRUE)
text(x = b2, y = -(colSums(x = significantGenesDownTbl)+50), labels = colSums(x = significantGenesDownTbl), cex = 0.7)
mtext(text = colnames(significantGenesUpTbl), side = 1, at = b1, las = 1, cex = 0.7)
mtext(text = "DEGs", side = 2, line = 2, cex = 0.8)
axis(side = 2, at = seq(-1000, 1000, by = 250), las = 2, cex.axis = 0.7)
image(
x = c(0.2, 0.3),
y = seq(700, 1000, length.out = 4),
z = matrix(1:4, nrow = 1),
col = brewer.pal(4,"Oranges"), add = TRUE, xlim = c(1, 8), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA
)
text(x = 0.35, y = seq(700, 1000, length.out = 4), labels = rownames(significantGenesUpTbl), adj = 0, xpd = TRUE, cex = 0.6)
image(
x = c(0.2, 0.3),
y = seq(-1000, -700, length.out = 4),
z = matrix(1:4, nrow = 1),
col = rev(brewer.pal(4,"Blues")), add = TRUE, xlim = c(1, 8), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA
)
text(x = 0.35, y = seq(-1000, -700, length.out = 4), labels = rownames(significantGenesDownTbl), adj = 0, xpd = TRUE, cex = 0.6)
epic_anno = data.table::copy(fData(object = meth_exprs))
data.table::setDT(epic_anno)
prom_probes = epic_anno[Annotation %in% 'promoter-TSS'][,.(rn, Hugo_Symbol)]
temp_meth = as.data.table(as.data.frame(exprs(meth_exprs)), keep.rownames = TRUE)
temp_meth = merge(prom_probes, temp_meth, by = 'rn')
temp_meth[,rn := NULL]
prom_meth = temp_meth[,lapply(.SD, mean, na.rm = TRUE), by = Hugo_Symbol]
data.table::setDF(x = prom_meth, rownames = prom_meth$Hugo_Symbol)
prom_meth$Hugo_Symbol = NULL
prom_meth = Biobase::ExpressionSet(
assayData = as.matrix(prom_meth),
phenoData = Biobase::AnnotatedDataFrame(pData(meth_exprs))
)
rm(temp_meth, prom_probes)
#Common genes covered by expression and methylation
com_genes = intersect(rownames(prom_meth), rownames(vsd_batch_corrected))
tum_samples = colnames(dds[,dds$Cluster != "Normal"])
Three analysis to be checked:
temp_anno = as.data.frame(colData(vsd))
lo = layout(matrix(data = 1:10, nrow = 5, ncol = 2, byrow = TRUE))
temp = lapply(names(res), function(cl){
de = res[[cl]]
cl_samps = rownames(temp_anno[temp_anno$Cluster %in% cl, ])
de_upg = intersect(de[padj < 0.1 & log2FoldChange > 0, rn], com_genes) #up-regulated DEGs
de_dng = intersect(de[padj < 0.1 & log2FoldChange < 0, rn], com_genes)#down-regulated DEGs
down_e = vsd_batch_corrected[de_dng,cl_samps]
down_e = down_e[complete.cases(down_e),]
up_e = vsd_batch_corrected[de_upg,cl_samps]
up_e = up_e[complete.cases(up_e),]
down_m = exprs(prom_meth[de_dng,cl_samps])
down_m = down_m[complete.cases(down_m),]
up_m = exprs(prom_meth[de_upg,cl_samps])
up_m = up_m[complete.cases(up_m),]
par(mar = c(2, 2, 1, 1))
boxplot(apply(down_e, 1, mean), apply(up_e, 1, mean), ylim = c(4, 16), horizontal = TRUE, notch = TRUE, col = c("#16a085", "#c0392b"))
pe = round(t.test(apply(down_e, 1, mean), apply(up_e, 1, mean), alternative = "l")$p.value, 2)
if(pe < 0.001){
text(x = 16, y = 1.5, labels = "***", srt = 90)
}else{
text(x = 16, y = 1.5, labels = "ns", srt = 90)
}
par(mar = c(2, 2, 1, 1))
boxplot(apply(down_m, 1, median), apply(up_m, 1, median), ylim = c(0, 1), horizontal = TRUE, notch = TRUE, col = c("#16a085", "#c0392b"))
pm = round(t.test(apply(down_m, 1, median), apply(up_m, 1, median), alternative = "g")$p.value, 2)
if(pm < 0.001){
text(x = 1, y = 1.5, labels = "***", srt = 90)
}else{
text(x = 1, y = 1.5, labels = "ns", srt = 90)
}
})
m = as.data.frame(exprs(prom_meth[com_genes, tum_samples]))
e = vsd_batch_corrected[com_genes, tum_samples]
if(!all(colnames(m) == colnames(e))){
stop("Sample names mismatch!")
}
if(!all(rownames(m) == rownames(e))){
stop("Genes names mismatch!")
}
em_cor = lapply(seq_len(nrow(m)), function(i){
cor.test(x = as.numeric(m[i,]), y = as.numeric(e[i,]), method = "spearman", alternative = "two.sided", exact = FALSE)
})
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
## Warning in cor(rank(x), rank(y)): the standard deviation is zero
names(em_cor) = rownames(m)
em_cor_summary = lapply(em_cor, function(x){
data.table::data.table(p_val = unlist(x$p.value), estimate = unlist(x$estimate))
})
em_cor_summary = data.table::rbindlist(em_cor_summary, idcol = "Hugo_Symbol")
em_cor_summary = em_cor_summary[!is.na(estimate)]
em_cor_summary$p_adjusted = p.adjust(em_cor_summary$p_val, method = "hochberg")
em_cor_summary = em_cor_summary[order(p_adjusted)]
dim(em_cor_summary)
## [1] 15889 4
par(mar = c(4, 3, 2, 1))
hist(em_cor_summary$estimate, breaks = 30, col = "#7f8c8d", border = "white", xlim = c(-1, 1), main = NA, xlab = NA, ylab = NA)
hist(em_cor_summary[p_adjusted < 0.1, estimate], breaks = 50, border = "white", add = TRUE, col = "#c0392b")
legend(x = "topright", legend = c("FDR < 0.1"), col = "#c0392b", pch= 15, bty = "n")
mtext(text = "Spearman correlation", side = 1, line = 2.5)
#mtext(text = "Density", side = 2, line = 2)
#text(x = -0.8, y = 250, labels = paste0("N = ", nrow(em_cor_summary[p_adjusted < 0.1])))
genes2heihlight = list(C1 = c("AZU1", "CSF3", "EGFL7", "FES", "JDP2", "S100A6", "SLC2A5"), C2 = c("CD160", "CD47", "FOSL1"), C3 = c("MAP3K8", "MYEOV"), C4 = c("NPTX1"), C5 = c("CAPG", "EMP1", "FAM83A", "LGALS1", "NCR1", "RGS17", "TEC", "ALS2CL2", "AMPH", "CMTM8", "DEPDC7", "HOOK1", "MITF", "MPPED2", "PCDH9", "PKNOX2", "PARRES1", "RASEF", "RNF180", "S100A16", "SLFN5"))
par(mfrow = c(1, 5), mar = c(2, 2, 1, 1))
prom_meth_diff = lapply(names(dmps), function(x){
#Mean lfc of all promoter associated probes
prom_lfc = dmps[[x]][Annotation %in% "promoter-TSS"][,mean(logFC), .(Hugo_Symbol)]
prom_lfc = merge(prom_lfc, res[[x]][,.(rn, log2FoldChange, padj)], by.x = 'Hugo_Symbol', by.y = 'rn') #Merge with correpsoiind expression LFC
prom_lfc = prom_lfc[order(padj)]
yat = seq(-30, 30, 10)#pretty(prom_lfc[,log2FoldChange])
plot(NA, xlim = c(-1, 1), ylim = range(yat), axes = FALSE)
abline(h = yat, v = seq(-1, 1, 0.25), lty = 2, col = "#ecf0f1")
points(prom_lfc$V1, prom_lfc$log2FoldChange, pch = 19, col = '#ecf0f1', cex = 0.6)
points(prom_lfc[padj < 0.1, V1], prom_lfc[padj < 0.1, log2FoldChange], pch = 19, col = '#95a5a6', cex = 0.6)
axis(side = 1, at = seq(-1, 1, 0.25))
axis(side = 2, at = yat)
#legend(x = "topleft", pch = 19, col = "black", legend = paste("FDR < 0.1 [", nrow(prom_lfc[padj < 0.1]), "]"), bty = "n", xpd = TRUE)
title(main = x, adj = 0)
# qlen = quad_lens(prom_lfc[padj < 0.1, V1], prom_lfc[padj < 0.1, log2FoldChange])
# #qpos = c("topright", "topleft", "bottomleft", "bottomright")
# for(pos_idx in seq_len(length(qlen))){
# legend(x = names(qlen)[pos_idx], legend = paste0("N = ", qlen[[pos_idx]]), bty = "n")
# }
colnames(prom_lfc) = c("Hugo_Symbol", "meth_diff", "exprs_lfc", "exprs_padj")
prom_lfc$Quadrant = ifelse(
test = prom_lfc$meth_diff > 0 &
prom_lfc$exprs_lfc < 0,
yes = "Q4",
no = ifelse(
test = prom_lfc$meth_diff < 0 &
prom_lfc$exprs_lfc > 0,
yes = "Q2",
no = "Q1/Q3"
)
)
prom_lfc$is_sig = ifelse(test = prom_lfc$exprs_padj < 0.1, yes = "yes", no = "no")
x_mark_genes = prom_lfc[Hugo_Symbol %in% genes2heihlight[[x]]]
points(x_mark_genes$meth_diff, x_mark_genes$exprs_lfc, pch = 19, col = 'maroon', cex = 0.6)
#text(x = x_mark_genes$meth_diff, y = x_mark_genes$exprs_lfc, labels = x_mark_genes$Hugo_Symbol, adj = 1, cex = 0.6)
if(nrow(x_mark_genes[Quadrant %in% "Q2"]) > 0){
q2marks = x_mark_genes[Quadrant %in% "Q2"]
text(x = -0.6, y = 30, labels = paste(q2marks$Hugo_Symbol, collapse = "\n"), adj = 0, cex = 0.7, xpd = TRUE)
}
if(nrow(x_mark_genes[Quadrant %in% "Q4"]) > 0){
q4marks = x_mark_genes[Quadrant %in% "Q4"]
text(x = 0.6, y = -20, labels = paste(q4marks$Hugo_Symbol, collapse = "\n"), adj = 1, cex = 0.7, xpd = TRUE)
}
prom_lfc
})
names(prom_meth_diff) = names(dmps)
#writexl::write_xlsx(x = prom_meth_diff, path = paste0(res_dir, "DMPs_prom_meth_exprs.xlsx"), col_names = T, format_headers = T)
surv_dat = pData(tumor_data)[,c("Sample_Name", "Overall_Survival", "Overall_Survival_Status", "Disease_Free_Survival", "Disease_Free_Survival_Status", "Cluster")]
#surv_dat$Disease_Free_Survival = surv_dat$Disease_Free_Survival/365
#surv_dat$Overall_Survival = surv_dat$Overall_Survival/365]
surv_cols = clust_cols[c("C1", "C2", "C3", "C4", "C5")]
#Overall survival
os_cluster = survival::survfit(survival::Surv(Overall_Survival, Overall_Survival_Status) ~ Cluster, surv_dat)
surv_diff = survival::survdiff(formula = survival::Surv(time = Overall_Survival, event = Overall_Survival_Status) ~ Cluster, data = surv_dat)
surv_diff_pval = signif(1 - pchisq(surv_diff$chisq, length(surv_diff$n) - 1), digits = 3)
#Disease free survival
dfs_cluster = survival::survfit(survival::Surv(Disease_Free_Survival, Disease_Free_Survival_Status) ~ Cluster, surv_dat)
df_surv_diff = survival::survdiff(formula = survival::Surv(time = Disease_Free_Survival, event = Disease_Free_Survival_Status) ~ Cluster, data = surv_dat)
df_surv_diff_pval = signif(1 - pchisq(df_surv_diff$chisq, length(df_surv_diff$n) - 1), digits = 3)
Survival plots in the manuscript are shown with risk table. However, here plots are shown without risk table.
par(mar = c(3, 4, 2, 1), mfrow = c(1, 2), cex = 0.7)
plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2000, 500), col = 'gray80', lty = 2)
lines(os_cluster, col = surv_cols, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
#text(x = 500, y = 0.2, labels = paste0("P = " , round(surv_diff_pval, digits = 3)))
legend(x = "bottomright", legend = names(surv_cols), col = surv_cols, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Overall Survival", adj = 0)
#plot(os_cluster, col = surv_cols, xlim = c(0, 2000), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)
plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2600, 500), col = 'gray80', lty = 2)
lines(dfs_cluster, col = surv_cols, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
#text(x = 500, y = 0.2, labels = paste0("P = " , round(df_surv_diff_pval, digits = 3)))
legend(x = "bottomright", legend = names(surv_cols), col = surv_cols, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Event Free Survival", adj = 0)
surv_dat$Cluster = as.character(surv_dat$Cluster)
surv_dat$three_group = ifelse(test = surv_dat$Cluster %in% c("C1", "C2"), yes = "C12", no = ifelse(test = surv_dat$Cluster %in% c("C3", "C4"), yes = "C34", no = "C5"))
print(table(surv_dat$three_group))
##
## C12 C34 C5
## 48 63 32
#Overall survival
os_cluster = survival::survfit(survival::Surv(Overall_Survival, Overall_Survival_Status) ~ three_group, surv_dat)
surv_diff = survival::survdiff(formula = survival::Surv(time = Overall_Survival, event = Overall_Survival_Status) ~ three_group, data = surv_dat)
surv_diff_pval = signif(1 - pchisq(surv_diff$chisq, length(surv_diff$n) - 1), digits = 3)
#Disease free survival
dfs_cluster = survival::survfit(survival::Surv(Disease_Free_Survival, Disease_Free_Survival_Status) ~ three_group, surv_dat)
df_surv_diff = survival::survdiff(formula = survival::Surv(time = Disease_Free_Survival, event = Disease_Free_Survival_Status) ~ three_group, data = surv_dat)
df_surv_diff_pval = signif(1 - pchisq(df_surv_diff$chisq, length(df_surv_diff$n) - 1), digits = 3)
clust3_col = c("#807DBA", "#238B45", "#F16913")
names(clust3_col) = c("C12", "C34", "C5")
par(mar = c(3, 4, 2, 1), mfrow = c(1, 2), cex = 0.7)
plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2600, 500), col = 'gray80', lty = 2)
lines(os_cluster, col = clust3_col, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
legend(x = "bottomright", legend = names(clust3_col), col = clust3_col, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Overall Survival", adj = 0)
#plot(os_cluster, col = surv_cols, xlim = c(0, 2000), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)
plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2600, 500), col = 'gray80', lty = 2)
lines(dfs_cluster, col = clust3_col, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
legend(x = "bottomright", legend = names(clust3_col), col = clust3_col, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Disease Free Survival", adj = 0)
Clinical characteristics of xenographed samples
xeno = maftools::read.maf(maf = "data/targetted_NGS/xeno.maf", clinicalData = "data/targetted_NGS/xeno_anno.csv", verbose = FALSE, vc_nonSyn = c("Mutated", "Amplification", "Deletion"), removeDuplicatedVariants = FALSE)
maftools::oncoplot(maf = xeno, colors = vc_color_codes, clinicalFeatures = c("Cluster", "Maturation_Arrest", "ETP_Phenotype", "Oncogenic_TF"), drawColBar = FALSE, drawRowBar = FALSE, annotationFontSize = 0.8, showTumorSampleBarcodes = TRUE, anno_height = 1.75, legendFontSize = 1, showTitle = FALSE, fontSize = 0.5, SampleNamefontSize = 0.7)
Excel sheet data/mice_experiment.xlsx contains results from 5-Aza treatment on mice models including survival, blast counts, and tumor engraftment (measured as a function of bioluminiscence (photons per second)) over the time course.
Xenografts
#Color codes
bg_col = grDevices::adjustcolor(col = 'gray', alpha.f = 0.6)
col_codes = c('curatively' = '#D95F02', 'untreated' = '#1B9E77', 'preventively' = '#E7298A')
lo = layout(mat = matrix(c(1:10), nrow = 2, ncol = 5, byrow = FALSE))
mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 1)
title(main = "UPNT-M149 (C3)", adj = 0, cex.main = 0.8)
mtext(text = "Surv. probability", side = 2, cex = 0.8, line = 1.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 2)
mtext(text = "% BLASTS", side = 2, cex = 0.8, line = 1.8)
mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 3)
title(main = "UPNT-670 (C4)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 4)
mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 5)
title(main = "UPNT-529 (C5)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 6)
mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 7)
title(main = "UPNT-M525 (C2)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 8)
mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 9)
title(main = "UPNT-M894 (C2)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 10)
ALL-SIL cell line
par(mfrow = c(1, 2))
mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 11)
allsil_biolumi = readxl::read_xlsx(path = "data/mice_experiment.xlsx", sheet = 12)
data.table::setDT(x = allsil_biolumi)
allsil_biolumi$days = as.numeric(as.character(allsil_biolumi$days))
allsil_biolumi = data.table::melt(data = allsil_biolumi, id.vars = "days")
allsil_biolumi$value = as.numeric(as.character(allsil_biolumi$value))
allsil_biolumi$treatment = data.table::tstrsplit(x = allsil_biolumi$variable, split = " ", keep = 1)
allsil_biolumi = allsil_biolumi[!is.na(value)]
allsil_biolumi[,value := log10(value)]
allsil_biolumi_stat = allsil_biolumi[,.(mean = mean(value), sem = sd(value)/.N), .(treatment, days)]
allsil_biolumi_stat[, sem_up := mean + sem]
allsil_biolumi_stat[, sem_low := mean - sem]
xrange = pretty(range(allsil_biolumi_stat$days))
yrange = pretty(range(allsil_biolumi_stat$sem_up))
allsil_biolumi_stat = split(allsil_biolumi_stat, as.factor(allsil_biolumi_stat$treatment))
par(mar = c(3, 3, 1, 1))
plot(NA, NA, xlim = range(xrange), ylim = range(yrange), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
abline(h = yrange, v = xrange, col = bg_col, lty = 2)
axis(side = 1, at = xrange, labels = xrange, lwd = 1, cex.axis = 0.8)
axis(side = 2, at = yrange, labels = yrange, las = 2, lwd = 1, cex.axis = 0.8)
mtext(text = "Days", side = 1, line = 2, font = 1, cex = 1)
mtext(text = "log10(photon/sec)", side = 2, line = 2, font = 1, cex = 0.8)
for(x in allsil_biolumi_stat){
points(x = x$days, y = x$mean, col = col_codes[x$treatment], type = 'l', lwd = 1.8)
segments(x0 = x$days, y0 = x$sem_low, x1 = x$days, y1 = x$sem_up, col = "black", lwd = 1.8)
points(x = x$days, y = x$mean, col = 'black', pch = 21, bg = col_codes[x$treatment], cex = 1)
}
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Pop!_OS 18.04 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] org.Hs.eg.db_3.12.0 AnnotationDbi_1.52.0
## [3] gtable_0.3.0 scales_1.1.1
## [5] peakSeason_0.1.0 ggplot2_3.3.3
## [7] RColorBrewer_1.1-2 ComplexHeatmap_2.6.2
## [9] circlize_0.4.12 pheatmap_1.0.12
## [11] survival_3.2-7 umap_0.2.7.0
## [13] fossil_0.4.0 shapefiles_0.7
## [15] foreign_0.8-80 maps_3.3.0
## [17] sp_1.4-5 NMF_0.23.0
## [19] cluster_2.1.0 rngtools_1.5
## [21] pkgmaker_0.32.2 registry_0.5-1
## [23] ape_5.4-1 LOLA_1.20.0
## [25] maftools_2.6.05 goseq_1.42.0
## [27] geneLenDataBase_1.26.0 BiasedUrn_1.07
## [29] sva_3.38.0 BiocParallel_1.24.1
## [31] genefilter_1.72.1 mgcv_1.8-33
## [33] nlme_3.1-149 DESeq2_1.30.1
## [35] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [37] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [39] GenomicRanges_1.42.0 GenomeInfoDb_1.26.2
## [41] IRanges_2.24.1 S4Vectors_0.28.1
## [43] BiocGenerics_0.36.0 limma_3.46.0
## [45] readxl_1.3.1 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] BiocFileCache_1.14.0 plyr_1.8.6 splines_4.0.2
## [4] gridBase_0.4-7 digest_0.6.27 foreach_1.5.1
## [7] htmltools_0.5.1.1 GO.db_3.12.1 fansi_0.4.2
## [10] magrittr_2.0.1 memoise_2.0.0 doParallel_1.0.16
## [13] Biostrings_2.58.0 annotate_1.68.0 R.utils_2.10.1
## [16] askpass_1.1 prettyunits_1.1.1 colorspace_2.0-0
## [19] blob_1.2.1 rappdirs_0.3.3 xfun_0.21
## [22] dplyr_1.0.4 crayon_1.4.1 RCurl_1.98-1.2
## [25] jsonlite_1.7.2 iterators_1.0.13 glue_1.4.2
## [28] zlibbioc_1.36.0 XVector_0.30.0 GetoptLong_1.0.5
## [31] DelayedArray_0.16.2 shape_1.4.5 abind_1.4-5
## [34] DBI_1.1.1 edgeR_3.32.1 Rcpp_1.0.6
## [37] xtable_1.8-4 progress_1.2.2 clue_0.3-58
## [40] reticulate_1.18 bit_4.0.4 httr_1.4.2
## [43] ellipsis_0.3.1 farver_2.1.0 R.methodsS3_1.8.1
## [46] pkgconfig_2.0.3 XML_3.99-0.5 sass_0.3.1
## [49] dbplyr_2.1.0 simpleCache_0.4.1 locfit_1.5-9.4
## [52] utf8_1.1.4 labeling_0.4.2 tidyselect_1.1.0
## [55] rlang_0.4.10 reshape2_1.4.4 munsell_0.5.0
## [58] cellranger_1.1.0 tools_4.0.2 cachem_1.0.4
## [61] generics_0.1.0 RSQLite_2.2.3 evaluate_0.14
## [64] stringr_1.4.0 fastmap_1.1.0 yaml_2.2.1
## [67] knitr_1.31 bit64_4.0.5 beanplot_1.2
## [70] purrr_0.3.4 R.oo_1.24.0 xml2_1.3.2
## [73] biomaRt_2.46.3 compiler_4.0.2 curl_4.3
## [76] png_0.1-7 tibble_3.1.0 geneplotter_1.68.0
## [79] bslib_0.2.4 stringi_1.5.3 highr_0.8
## [82] GenomicFeatures_1.42.1 RSpectra_0.16-0 lattice_0.20-41
## [85] Matrix_1.3-2 vctrs_0.3.6 pillar_1.5.0
## [88] lifecycle_1.0.0 jquerylib_0.1.3 GlobalOptions_0.1.2
## [91] bitops_1.0-6 qvalue_2.22.0 rtracklayer_1.50.0
## [94] R6_2.5.0 writexl_1.3.1 codetools_0.2-16
## [97] assertthat_0.2.1 rjson_0.2.20 openssl_1.4.3
## [100] withr_2.4.1 GenomicAlignments_1.26.0 Rsamtools_2.6.0
## [103] GenomeInfoDbData_1.2.4 berryFunctions_1.19.1 hms_1.0.0
## [106] rmarkdown_2.7 Cairo_1.5-12.2